[llvm] 76274eb - [PHIElimination] Revert #131837 #146320 #146337 (#146850)

via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 3 04:48:29 PDT 2025


Author: Guy David
Date: 2025-07-03T07:48:08-04:00
New Revision: 76274eb2b3439aac6991c6b505248e00627e5693

URL: https://github.com/llvm/llvm-project/commit/76274eb2b3439aac6991c6b505248e00627e5693
DIFF: https://github.com/llvm/llvm-project/commit/76274eb2b3439aac6991c6b505248e00627e5693.diff

LOG: [PHIElimination] Revert #131837 #146320 #146337 (#146850)

Reverting because of mis-compiles introduced by:
- https://github.com/llvm/llvm-project/pull/131837
- https://github.com/llvm/llvm-project/pull/146320
- https://github.com/llvm/llvm-project/pull/146337

Added: 
    

Modified: 
    llvm/lib/CodeGen/PHIElimination.cpp
    llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-outline_atomics.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-v8a.ll
    llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir
    llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
    llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
    llvm/test/CodeGen/AArch64/bfis-in-loop.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
    llvm/test/CodeGen/AArch64/phi.ll
    llvm/test/CodeGen/AArch64/pr48188.ll
    llvm/test/CodeGen/AArch64/ragreedy-csr.ll
    llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
    llvm/test/CodeGen/AArch64/reduce-or-opt.ll
    llvm/test/CodeGen/AArch64/sink-and-fold.ll
    llvm/test/CodeGen/AArch64/sve-lsrchain.ll
    llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
    llvm/test/CodeGen/AArch64/swifterror.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
    llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
    llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
    llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
    llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
    llvm/test/CodeGen/AMDGPU/div_i128.ll
    llvm/test/CodeGen/AMDGPU/div_v2i128.ll
    llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
    llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
    llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
    llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
    llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
    llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
    llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
    llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
    llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll
    llvm/test/CodeGen/AMDGPU/mul.ll
    llvm/test/CodeGen/AMDGPU/rem_i128.ll
    llvm/test/CodeGen/AMDGPU/sdiv64.ll
    llvm/test/CodeGen/AMDGPU/srem64.ll
    llvm/test/CodeGen/AMDGPU/udiv64.ll
    llvm/test/CodeGen/AMDGPU/urem64.ll
    llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
    llvm/test/CodeGen/AMDGPU/wave32.ll
    llvm/test/CodeGen/ARM/and-cmp0-sink.ll
    llvm/test/CodeGen/ARM/cttz.ll
    llvm/test/CodeGen/ARM/select-imm.ll
    llvm/test/CodeGen/ARM/struct-byval-loop.ll
    llvm/test/CodeGen/ARM/swifterror.ll
    llvm/test/CodeGen/AVR/bug-81911.ll
    llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll
    llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll
    llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll
    llvm/test/CodeGen/Hexagon/swp-stages4.ll
    llvm/test/CodeGen/Hexagon/tinycore.ll
    llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
    llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir
    llvm/test/CodeGen/PowerPC/disable-ctr-ppcf128.ll
    llvm/test/CodeGen/PowerPC/phi-eliminate.mir
    llvm/test/CodeGen/PowerPC/ppcf128-freeze.mir
    llvm/test/CodeGen/PowerPC/pr116071.ll
    llvm/test/CodeGen/PowerPC/sms-phi-2.ll
    llvm/test/CodeGen/PowerPC/sms-phi-3.ll
    llvm/test/CodeGen/PowerPC/stack-restore-with-setjmp.ll
    llvm/test/CodeGen/PowerPC/subreg-postra-2.ll
    llvm/test/CodeGen/PowerPC/vsx.ll
    llvm/test/CodeGen/RISCV/abds.ll
    llvm/test/CodeGen/RISCV/machine-pipeliner.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
    llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
    llvm/test/CodeGen/RISCV/xcvbi.ll
    llvm/test/CodeGen/SystemZ/swifterror.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
    llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
    llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
    llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
    llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll
    llvm/test/CodeGen/Thumb2/mve-phireg.ll
    llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll
    llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
    llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll
    llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
    llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
    llvm/test/CodeGen/Thumb2/pr52817.ll
    llvm/test/CodeGen/VE/Scalar/br_jt.ll
    llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
    llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
    llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
    llvm/test/CodeGen/X86/atomic32.ll
    llvm/test/CodeGen/X86/atomic64.ll
    llvm/test/CodeGen/X86/atomic6432.ll
    llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll
    llvm/test/CodeGen/X86/callbr-asm-kill.mir
    llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll
    llvm/test/CodeGen/X86/combine-pmuldq.ll
    llvm/test/CodeGen/X86/fp128-select.ll
    llvm/test/CodeGen/X86/madd.ll
    llvm/test/CodeGen/X86/masked_load.ll
    llvm/test/CodeGen/X86/min-legal-vector-width.ll
    llvm/test/CodeGen/X86/pcsections-atomics.ll
    llvm/test/CodeGen/X86/pr15705.ll
    llvm/test/CodeGen/X86/pr32256.ll
    llvm/test/CodeGen/X86/pr38795.ll
    llvm/test/CodeGen/X86/pr49451.ll
    llvm/test/CodeGen/X86/pr63108.ll
    llvm/test/CodeGen/X86/sad.ll
    llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
    llvm/test/CodeGen/X86/statepoint-cmp-sunk-past-statepoint.ll
    llvm/test/CodeGen/X86/swifterror.ll
    llvm/test/DebugInfo/MIR/InstrRef/phi-regallocd-to-stack.mir
    llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll
    llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll

Removed: 
    llvm/test/CodeGen/AArch64/PHIElimination-reuse-copy.mir
    llvm/test/CodeGen/AArch64/block-layout-regression.mir


################################################################################
diff  --git a/llvm/lib/CodeGen/PHIElimination.cpp b/llvm/lib/CodeGen/PHIElimination.cpp
index 86523c22a419d..a93a89ecaa96e 100644
--- a/llvm/lib/CodeGen/PHIElimination.cpp
+++ b/llvm/lib/CodeGen/PHIElimination.cpp
@@ -581,20 +581,6 @@ void PHIEliminationImpl::LowerPHINode(MachineBasicBlock &MBB,
       continue;
     }
 
-    // Reuse an existing copy in the block if possible.
-    if (IncomingReg.isVirtual()) {
-      MachineInstr *DefMI = MRI->getUniqueVRegDef(SrcReg);
-      const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
-      const TargetRegisterClass *IncomingRC = MRI->getRegClass(IncomingReg);
-      if (DefMI && DefMI->isCopy() && DefMI->getParent() == &opBlock &&
-          MRI->use_empty(SrcReg) && IncomingRC->hasSuperClassEq(SrcRC)) {
-        DefMI->getOperand(0).setReg(IncomingReg);
-        if (LV)
-          LV->getVarInfo(SrcReg).AliveBlocks.clear();
-        continue;
-      }
-    }
-
     // Find a safe location to insert the copy, this may be the first terminator
     // in the block (or end()).
     MachineBasicBlock::iterator InsertPos =

diff  --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-outline_atomics.ll
index 6c300b04508b2..c1c5c53aa7df2 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-outline_atomics.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-outline_atomics.ll
@@ -118,8 +118,8 @@ define dso_local void @store_atomic_i64_aligned_seq_cst(i64 %value, ptr %ptr) {
 define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) {
 ; -O0-LABEL: store_atomic_i128_aligned_unordered:
 ; -O0:    bl __aarch64_cas16_relax
-; -O0:    subs x9, x0, x9
-; -O0:    ccmp x1, x8, #0, eq
+; -O0:    subs x10, x10, x11
+; -O0:    ccmp x8, x9, #0, eq
 ;
 ; -O1-LABEL: store_atomic_i128_aligned_unordered:
 ; -O1:    ldxp xzr, x8, [x2]
@@ -131,8 +131,8 @@ define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr
 define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) {
 ; -O0-LABEL: store_atomic_i128_aligned_monotonic:
 ; -O0:    bl __aarch64_cas16_relax
-; -O0:    subs x9, x0, x9
-; -O0:    ccmp x1, x8, #0, eq
+; -O0:    subs x10, x10, x11
+; -O0:    ccmp x8, x9, #0, eq
 ;
 ; -O1-LABEL: store_atomic_i128_aligned_monotonic:
 ; -O1:    ldxp xzr, x8, [x2]
@@ -144,8 +144,8 @@ define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr
 define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) {
 ; -O0-LABEL: store_atomic_i128_aligned_release:
 ; -O0:    bl __aarch64_cas16_rel
-; -O0:    subs x9, x0, x9
-; -O0:    ccmp x1, x8, #0, eq
+; -O0:    subs x10, x10, x11
+; -O0:    ccmp x8, x9, #0, eq
 ;
 ; -O1-LABEL: store_atomic_i128_aligned_release:
 ; -O1:    ldxp xzr, x8, [x2]
@@ -157,8 +157,8 @@ define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr)
 define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) {
 ; -O0-LABEL: store_atomic_i128_aligned_seq_cst:
 ; -O0:    bl __aarch64_cas16_acq_rel
-; -O0:    subs x9, x0, x9
-; -O0:    ccmp x1, x8, #0, eq
+; -O0:    subs x10, x10, x11
+; -O0:    ccmp x8, x9, #0, eq
 ;
 ; -O1-LABEL: store_atomic_i128_aligned_seq_cst:
 ; -O1:    ldaxp xzr, x8, [x2]

diff  --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc.ll
index 2a7bbad9d6454..d1047d84e2956 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc.ll
@@ -117,13 +117,13 @@ define dso_local void @store_atomic_i64_aligned_seq_cst(i64 %value, ptr %ptr) {
 
 define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) {
 ; -O0-LABEL: store_atomic_i128_aligned_unordered:
-; -O0:    ldxp x8, x10, [x13]
-; -O0:    cmp x8, x9
+; -O0:    ldxp x10, x12, [x9]
 ; -O0:    cmp x10, x11
-; -O0:    stxp w12, x14, x15, [x13]
-; -O0:    stxp w12, x8, x10, [x13]
-; -O0:    subs x10, x10, x11
-; -O0:    ccmp x8, x9, #0, eq
+; -O0:    cmp x12, x13
+; -O0:    stxp w8, x14, x15, [x9]
+; -O0:    stxp w8, x10, x12, [x9]
+; -O0:    subs x12, x12, x13
+; -O0:    ccmp x10, x11, #0, eq
 ;
 ; -O1-LABEL: store_atomic_i128_aligned_unordered:
 ; -O1:    ldxp xzr, x8, [x2]
@@ -134,13 +134,13 @@ define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr
 
 define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) {
 ; -O0-LABEL: store_atomic_i128_aligned_monotonic:
-; -O0:    ldxp x8, x10, [x13]
-; -O0:    cmp x8, x9
+; -O0:    ldxp x10, x12, [x9]
 ; -O0:    cmp x10, x11
-; -O0:    stxp w12, x14, x15, [x13]
-; -O0:    stxp w12, x8, x10, [x13]
-; -O0:    subs x10, x10, x11
-; -O0:    ccmp x8, x9, #0, eq
+; -O0:    cmp x12, x13
+; -O0:    stxp w8, x14, x15, [x9]
+; -O0:    stxp w8, x10, x12, [x9]
+; -O0:    subs x12, x12, x13
+; -O0:    ccmp x10, x11, #0, eq
 ;
 ; -O1-LABEL: store_atomic_i128_aligned_monotonic:
 ; -O1:    ldxp xzr, x8, [x2]
@@ -151,13 +151,13 @@ define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr
 
 define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) {
 ; -O0-LABEL: store_atomic_i128_aligned_release:
-; -O0:    ldxp x8, x10, [x13]
-; -O0:    cmp x8, x9
+; -O0:    ldxp x10, x12, [x9]
 ; -O0:    cmp x10, x11
-; -O0:    stlxp w12, x14, x15, [x13]
-; -O0:    stlxp w12, x8, x10, [x13]
-; -O0:    subs x10, x10, x11
-; -O0:    ccmp x8, x9, #0, eq
+; -O0:    cmp x12, x13
+; -O0:    stlxp w8, x14, x15, [x9]
+; -O0:    stlxp w8, x10, x12, [x9]
+; -O0:    subs x12, x12, x13
+; -O0:    ccmp x10, x11, #0, eq
 ;
 ; -O1-LABEL: store_atomic_i128_aligned_release:
 ; -O1:    ldxp xzr, x8, [x2]
@@ -168,13 +168,13 @@ define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr)
 
 define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) {
 ; -O0-LABEL: store_atomic_i128_aligned_seq_cst:
-; -O0:    ldaxp x8, x10, [x13]
-; -O0:    cmp x8, x9
+; -O0:    ldaxp x10, x12, [x9]
 ; -O0:    cmp x10, x11
-; -O0:    stlxp w12, x14, x15, [x13]
-; -O0:    stlxp w12, x8, x10, [x13]
-; -O0:    subs x10, x10, x11
-; -O0:    ccmp x8, x9, #0, eq
+; -O0:    cmp x12, x13
+; -O0:    stlxp w8, x14, x15, [x9]
+; -O0:    stlxp w8, x10, x12, [x9]
+; -O0:    subs x12, x12, x13
+; -O0:    ccmp x10, x11, #0, eq
 ;
 ; -O1-LABEL: store_atomic_i128_aligned_seq_cst:
 ; -O1:    ldaxp xzr, x8, [x2]

diff  --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-v8a.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-v8a.ll
index 493bc742f7663..1a79c73355143 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-v8a.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-v8a.ll
@@ -117,13 +117,13 @@ define dso_local void @store_atomic_i64_aligned_seq_cst(i64 %value, ptr %ptr) {
 
 define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) {
 ; -O0-LABEL: store_atomic_i128_aligned_unordered:
-; -O0:    ldxp x8, x10, [x13]
-; -O0:    cmp x8, x9
+; -O0:    ldxp x10, x12, [x9]
 ; -O0:    cmp x10, x11
-; -O0:    stxp w12, x14, x15, [x13]
-; -O0:    stxp w12, x8, x10, [x13]
-; -O0:    subs x10, x10, x11
-; -O0:    ccmp x8, x9, #0, eq
+; -O0:    cmp x12, x13
+; -O0:    stxp w8, x14, x15, [x9]
+; -O0:    stxp w8, x10, x12, [x9]
+; -O0:    subs x12, x12, x13
+; -O0:    ccmp x10, x11, #0, eq
 ;
 ; -O1-LABEL: store_atomic_i128_aligned_unordered:
 ; -O1:    ldxp xzr, x8, [x2]
@@ -134,13 +134,13 @@ define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr
 
 define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) {
 ; -O0-LABEL: store_atomic_i128_aligned_monotonic:
-; -O0:    ldxp x8, x10, [x13]
-; -O0:    cmp x8, x9
+; -O0:    ldxp x10, x12, [x9]
 ; -O0:    cmp x10, x11
-; -O0:    stxp w12, x14, x15, [x13]
-; -O0:    stxp w12, x8, x10, [x13]
-; -O0:    subs x10, x10, x11
-; -O0:    ccmp x8, x9, #0, eq
+; -O0:    cmp x12, x13
+; -O0:    stxp w8, x14, x15, [x9]
+; -O0:    stxp w8, x10, x12, [x9]
+; -O0:    subs x12, x12, x13
+; -O0:    ccmp x10, x11, #0, eq
 ;
 ; -O1-LABEL: store_atomic_i128_aligned_monotonic:
 ; -O1:    ldxp xzr, x8, [x2]
@@ -151,13 +151,13 @@ define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr
 
 define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) {
 ; -O0-LABEL: store_atomic_i128_aligned_release:
-; -O0:    ldxp x8, x10, [x13]
-; -O0:    cmp x8, x9
+; -O0:    ldxp x10, x12, [x9]
 ; -O0:    cmp x10, x11
-; -O0:    stlxp w12, x14, x15, [x13]
-; -O0:    stlxp w12, x8, x10, [x13]
-; -O0:    subs x10, x10, x11
-; -O0:    ccmp x8, x9, #0, eq
+; -O0:    cmp x12, x13
+; -O0:    stlxp w8, x14, x15, [x9]
+; -O0:    stlxp w8, x10, x12, [x9]
+; -O0:    subs x12, x12, x13
+; -O0:    ccmp x10, x11, #0, eq
 ;
 ; -O1-LABEL: store_atomic_i128_aligned_release:
 ; -O1:    ldxp xzr, x8, [x2]
@@ -168,13 +168,13 @@ define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr)
 
 define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) {
 ; -O0-LABEL: store_atomic_i128_aligned_seq_cst:
-; -O0:    ldaxp x8, x10, [x13]
-; -O0:    cmp x8, x9
+; -O0:    ldaxp x10, x12, [x9]
 ; -O0:    cmp x10, x11
-; -O0:    stlxp w12, x14, x15, [x13]
-; -O0:    stlxp w12, x8, x10, [x13]
-; -O0:    subs x10, x10, x11
-; -O0:    ccmp x8, x9, #0, eq
+; -O0:    cmp x12, x13
+; -O0:    stlxp w8, x14, x15, [x9]
+; -O0:    stlxp w8, x10, x12, [x9]
+; -O0:    subs x12, x12, x13
+; -O0:    ccmp x10, x11, #0, eq
 ;
 ; -O1-LABEL: store_atomic_i128_aligned_seq_cst:
 ; -O1:    ldaxp xzr, x8, [x2]

diff  --git a/llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir b/llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir
index 993d1c1f1b5f0..01c44e3f253bb 100644
--- a/llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir
+++ b/llvm/test/CodeGen/AArch64/PHIElimination-debugloc.mir
@@ -37,7 +37,7 @@ body: |
   bb.1:
     %x:gpr32 = COPY $wzr
   ; Test that the debug location is not copied into bb1!
-  ; CHECK: %3:gpr32 = COPY $wzr
+  ; CHECK: %3:gpr32 = COPY killed %x{{$}}
   ; CHECK-LABEL: bb.2:
   bb.2:
     %y:gpr32 = PHI %x:gpr32, %bb.1, undef %undef:gpr32, %bb.0, debug-location !14

diff  --git a/llvm/test/CodeGen/AArch64/PHIElimination-reuse-copy.mir b/llvm/test/CodeGen/AArch64/PHIElimination-reuse-copy.mir
deleted file mode 100644
index 20020a8ed3fb7..0000000000000
--- a/llvm/test/CodeGen/AArch64/PHIElimination-reuse-copy.mir
+++ /dev/null
@@ -1,193 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -run-pass=livevars,phi-node-elimination -verify-machineinstrs -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
-
-# Verify that the original COPY in bb.1 is reappropriated as the PHI source in bb.2,
-# instead of creating a new COPY with the same source register.
-
----
-name: copy_virtual_reg
-tracksRegLiveness: true
-body: |
-  ; CHECK-LABEL: name: copy_virtual_reg
-  ; CHECK: bb.0:
-  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
-  ; CHECK-NEXT:   liveins: $nzcv, $w0
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   %a:gpr32 = COPY killed $w0
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:gpr32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   Bcc 8, %bb.2, implicit killed $nzcv
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.1:
-  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:gpr32 = COPY killed %a
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.2:
-  ; CHECK-NEXT:   %c:gpr32 = COPY killed [[DEF]]
-  ; CHECK-NEXT:   dead %d:gpr32 = COPY killed %c
-  bb.0:
-    liveins: $nzcv, $w0
-    %a:gpr32 = COPY $w0
-    Bcc 8, %bb.2, implicit $nzcv
-  bb.1:
-    %b:gpr32 = COPY %a:gpr32
-  bb.2:
-    %c:gpr32 = PHI %b:gpr32, %bb.1, undef %undef:gpr32, %bb.0
-    %d:gpr32 = COPY %c:gpr32
-...
-
----
-name: copy_physical_reg
-tracksRegLiveness: true
-body: |
-  ; CHECK-LABEL: name: copy_physical_reg
-  ; CHECK: bb.0:
-  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
-  ; CHECK-NEXT:   liveins: $nzcv, $w0
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:gpr32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   Bcc 8, %bb.2, implicit killed $nzcv
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.1:
-  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   dead $x0 = IMPLICIT_DEF implicit-def $w0
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:gpr32 = COPY killed $w0
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.2:
-  ; CHECK-NEXT:   dead %b:gpr32 = COPY killed [[DEF]]
-  bb.0:
-    liveins: $nzcv, $w0
-    Bcc 8, %bb.2, implicit $nzcv
-  bb.1:
-    $x0 = IMPLICIT_DEF
-    %a:gpr32 = COPY $w0
-  bb.2:
-    %b:gpr32 = PHI %a:gpr32, %bb.1, undef %undef:gpr32, %bb.0
-...
-
----
-name: copy_to_dead
-tracksRegLiveness: true
-body: |
-  ; CHECK-LABEL: name: copy_to_dead
-  ; CHECK: bb.0:
-  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
-  ; CHECK-NEXT:   liveins: $wzr, $xzr
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr32 = COPY $wzr
-  ; CHECK-NEXT:   dead [[COPY1:%[0-9]+]]:gpr64 = COPY $xzr
-  ; CHECK-NEXT:   TBZW killed [[COPY]], 0, %bb.2
-  ; CHECK-NEXT:   B %bb.1
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.1:
-  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   dead [[DEF:%[0-9]+]]:gpr64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   dead [[DEF1:%[0-9]+]]:gpr64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   B %bb.2
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.2:
-  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   dead [[DEF2:%[0-9]+]]:gpr64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   dead [[DEF3:%[0-9]+]]:gpr64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   B %bb.1
-  bb.0:
-    liveins: $wzr, $xzr
-
-    %9:gpr32 = COPY $wzr
-    dead %5:gpr64 = COPY $xzr
-    TBZW killed %9:gpr32, 0, %bb.2
-    B %bb.1
-
-  bb.1:
-    successors: %bb.2(0x80000000); %bb.2(100.00%)
-
-    dead %1:gpr64 = PHI undef %3:gpr64, %bb.2, undef %5:gpr64, %bb.0
-    dead %2:gpr64 = PHI undef %4:gpr64, %bb.2, undef %5:gpr64, %bb.0
-    B %bb.2
-
-  bb.2:
-    successors: %bb.1(0x80000000); %bb.1(100.00%)
-
-    dead %3:gpr64 = PHI undef %1:gpr64, %bb.1, undef %5:gpr64, %bb.0
-    dead %4:gpr64 = PHI undef %2:gpr64, %bb.1, undef %5:gpr64, %bb.0
-    B %bb.1
-
-...
-
----
-name: update_livevars
-tracksRegLiveness: true
-body:             |
-  ; CHECK-LABEL: name: update_livevars
-  ; CHECK: bb.0:
-  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
-  ; CHECK-NEXT:   liveins: $w0, $w1, $nzcv
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr32 = COPY killed $w0
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr32 = COPY killed $w1
-  ; CHECK-NEXT:   B %bb.1
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.1:
-  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
-  ; CHECK-NEXT:   liveins: $nzcv
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   dead [[COPY2:%[0-9]+]]:gpr32 = COPY killed [[COPY1]]
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr32 = COPY [[COPY]]
-  ; CHECK-NEXT:   Bcc 1, %bb.1, implicit $nzcv
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.2:
-  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
-  ; CHECK-NEXT:   liveins: $nzcv
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   B %bb.1
-  bb.0:
-    successors: %bb.1
-    liveins: $w0, $w1, $nzcv
-
-    %0:gpr32 = COPY killed $w0
-    %1:gpr32 = COPY killed $w1
-    B %bb.1
-
-  bb.1:
-    successors: %bb.2, %bb.1
-    liveins: $nzcv
-
-    %2:gpr32 = PHI %3, %bb.2, %1, %bb.0, %3, %bb.1
-    %3:gpr32 = COPY %0
-    Bcc 1, %bb.1, implicit $nzcv
-
-  bb.2:
-    successors: %bb.1
-    liveins: $nzcv
-
-    B %bb.1
-...
-
----
-name: copy_subreg
-tracksRegLiveness: true
-body:             |
-  ; CHECK-LABEL: name: copy_subreg
-  ; CHECK: bb.0:
-  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
-  ; CHECK-NEXT:   liveins: $x0
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64 = COPY killed $x0
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64 = COPY killed [[COPY]]
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.1:
-  ; CHECK-NEXT:   dead [[COPY2:%[0-9]+]]:gpr32 = COPY killed [[COPY1]].sub_32
-  bb.0:
-    successors: %bb.1
-    liveins: $x0
-
-    %0:gpr64 = COPY killed $x0
-    %1:gpr64 = COPY killed %0
-
-  bb.1:
-    %2:gpr32 = PHI %1.sub_32, %bb.0
-...

diff  --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index ca1052a769408..8655bb1292ef7 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -583,8 +583,8 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A,
 ; CHECK-SD-NEXT:    mov w10, w2
 ; CHECK-SD-NEXT:    b.hi .LBB5_4
 ; CHECK-SD-NEXT:  // %bb.2:
-; CHECK-SD-NEXT:    mov w8, wzr
 ; CHECK-SD-NEXT:    mov x11, xzr
+; CHECK-SD-NEXT:    mov w8, wzr
 ; CHECK-SD-NEXT:    b .LBB5_7
 ; CHECK-SD-NEXT:  .LBB5_3:
 ; CHECK-SD-NEXT:    mov w8, wzr

diff  --git a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
index 9fd27edae3176..71e0250b36972 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
@@ -199,16 +199,16 @@ define i128 @test_rmw_add_128(ptr %dst)   {
 ; NOLSE-NEXT:    sub sp, sp, #48
 ; NOLSE-NEXT:    .cfi_def_cfa_offset 48
 ; NOLSE-NEXT:    str x0, [sp, #24] // 8-byte Folded Spill
-; NOLSE-NEXT:    ldr x9, [x0, #8]
-; NOLSE-NEXT:    ldr x8, [x0]
+; NOLSE-NEXT:    ldr x8, [x0, #8]
+; NOLSE-NEXT:    ldr x9, [x0]
 ; NOLSE-NEXT:    str x9, [sp, #32] // 8-byte Folded Spill
 ; NOLSE-NEXT:    str x8, [sp, #40] // 8-byte Folded Spill
 ; NOLSE-NEXT:    b .LBB4_1
 ; NOLSE-NEXT:  .LBB4_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; NOLSE-NEXT:    // Child Loop BB4_2 Depth 2
-; NOLSE-NEXT:    ldr x13, [sp, #32] // 8-byte Folded Reload
-; NOLSE-NEXT:    ldr x11, [sp, #40] // 8-byte Folded Reload
+; NOLSE-NEXT:    ldr x13, [sp, #40] // 8-byte Folded Reload
+; NOLSE-NEXT:    ldr x11, [sp, #32] // 8-byte Folded Reload
 ; NOLSE-NEXT:    ldr x9, [sp, #24] // 8-byte Folded Reload
 ; NOLSE-NEXT:    adds x14, x11, #1
 ; NOLSE-NEXT:    cinc x15, x13, hs
@@ -238,8 +238,8 @@ define i128 @test_rmw_add_128(ptr %dst)   {
 ; NOLSE-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
 ; NOLSE-NEXT:    subs x12, x12, x13
 ; NOLSE-NEXT:    ccmp x10, x11, #0, eq
-; NOLSE-NEXT:    str x9, [sp, #40] // 8-byte Folded Spill
-; NOLSE-NEXT:    str x8, [sp, #32] // 8-byte Folded Spill
+; NOLSE-NEXT:    str x9, [sp, #32] // 8-byte Folded Spill
+; NOLSE-NEXT:    str x8, [sp, #40] // 8-byte Folded Spill
 ; NOLSE-NEXT:    b.ne .LBB4_1
 ; NOLSE-NEXT:    b .LBB4_6
 ; NOLSE-NEXT:  .LBB4_6: // %atomicrmw.end
@@ -253,15 +253,15 @@ define i128 @test_rmw_add_128(ptr %dst)   {
 ; LSE-NEXT:    sub sp, sp, #48
 ; LSE-NEXT:    .cfi_def_cfa_offset 48
 ; LSE-NEXT:    str x0, [sp, #24] // 8-byte Folded Spill
-; LSE-NEXT:    ldr x9, [x0, #8]
-; LSE-NEXT:    ldr x8, [x0]
+; LSE-NEXT:    ldr x8, [x0, #8]
+; LSE-NEXT:    ldr x9, [x0]
 ; LSE-NEXT:    str x9, [sp, #32] // 8-byte Folded Spill
 ; LSE-NEXT:    str x8, [sp, #40] // 8-byte Folded Spill
 ; LSE-NEXT:    b .LBB4_1
 ; LSE-NEXT:  .LBB4_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    ldr x11, [sp, #32] // 8-byte Folded Reload
-; LSE-NEXT:    ldr x10, [sp, #40] // 8-byte Folded Reload
+; LSE-NEXT:    ldr x11, [sp, #40] // 8-byte Folded Reload
+; LSE-NEXT:    ldr x10, [sp, #32] // 8-byte Folded Reload
 ; LSE-NEXT:    ldr x8, [sp, #24] // 8-byte Folded Reload
 ; LSE-NEXT:    mov x0, x10
 ; LSE-NEXT:    mov x1, x11
@@ -276,8 +276,8 @@ define i128 @test_rmw_add_128(ptr %dst)   {
 ; LSE-NEXT:    str x8, [sp, #16] // 8-byte Folded Spill
 ; LSE-NEXT:    subs x11, x8, x11
 ; LSE-NEXT:    ccmp x9, x10, #0, eq
-; LSE-NEXT:    str x9, [sp, #40] // 8-byte Folded Spill
-; LSE-NEXT:    str x8, [sp, #32] // 8-byte Folded Spill
+; LSE-NEXT:    str x9, [sp, #32] // 8-byte Folded Spill
+; LSE-NEXT:    str x8, [sp, #40] // 8-byte Folded Spill
 ; LSE-NEXT:    b.ne .LBB4_1
 ; LSE-NEXT:    b .LBB4_2
 ; LSE-NEXT:  .LBB4_2: // %atomicrmw.end
@@ -573,16 +573,16 @@ define i128 @test_rmw_nand_128(ptr %dst)   {
 ; NOLSE-NEXT:    sub sp, sp, #48
 ; NOLSE-NEXT:    .cfi_def_cfa_offset 48
 ; NOLSE-NEXT:    str x0, [sp, #24] // 8-byte Folded Spill
-; NOLSE-NEXT:    ldr x9, [x0, #8]
-; NOLSE-NEXT:    ldr x8, [x0]
+; NOLSE-NEXT:    ldr x8, [x0, #8]
+; NOLSE-NEXT:    ldr x9, [x0]
 ; NOLSE-NEXT:    str x9, [sp, #32] // 8-byte Folded Spill
 ; NOLSE-NEXT:    str x8, [sp, #40] // 8-byte Folded Spill
 ; NOLSE-NEXT:    b .LBB9_1
 ; NOLSE-NEXT:  .LBB9_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; NOLSE-NEXT:    // Child Loop BB9_2 Depth 2
-; NOLSE-NEXT:    ldr x13, [sp, #32] // 8-byte Folded Reload
-; NOLSE-NEXT:    ldr x11, [sp, #40] // 8-byte Folded Reload
+; NOLSE-NEXT:    ldr x13, [sp, #40] // 8-byte Folded Reload
+; NOLSE-NEXT:    ldr x11, [sp, #32] // 8-byte Folded Reload
 ; NOLSE-NEXT:    ldr x9, [sp, #24] // 8-byte Folded Reload
 ; NOLSE-NEXT:    mov w8, w11
 ; NOLSE-NEXT:    mvn w10, w8
@@ -616,8 +616,8 @@ define i128 @test_rmw_nand_128(ptr %dst)   {
 ; NOLSE-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
 ; NOLSE-NEXT:    subs x12, x12, x13
 ; NOLSE-NEXT:    ccmp x10, x11, #0, eq
-; NOLSE-NEXT:    str x9, [sp, #40] // 8-byte Folded Spill
-; NOLSE-NEXT:    str x8, [sp, #32] // 8-byte Folded Spill
+; NOLSE-NEXT:    str x9, [sp, #32] // 8-byte Folded Spill
+; NOLSE-NEXT:    str x8, [sp, #40] // 8-byte Folded Spill
 ; NOLSE-NEXT:    b.ne .LBB9_1
 ; NOLSE-NEXT:    b .LBB9_6
 ; NOLSE-NEXT:  .LBB9_6: // %atomicrmw.end
@@ -631,15 +631,15 @@ define i128 @test_rmw_nand_128(ptr %dst)   {
 ; LSE-NEXT:    sub sp, sp, #48
 ; LSE-NEXT:    .cfi_def_cfa_offset 48
 ; LSE-NEXT:    str x0, [sp, #24] // 8-byte Folded Spill
-; LSE-NEXT:    ldr x9, [x0, #8]
-; LSE-NEXT:    ldr x8, [x0]
+; LSE-NEXT:    ldr x8, [x0, #8]
+; LSE-NEXT:    ldr x9, [x0]
 ; LSE-NEXT:    str x9, [sp, #32] // 8-byte Folded Spill
 ; LSE-NEXT:    str x8, [sp, #40] // 8-byte Folded Spill
 ; LSE-NEXT:    b .LBB9_1
 ; LSE-NEXT:  .LBB9_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    ldr x11, [sp, #32] // 8-byte Folded Reload
-; LSE-NEXT:    ldr x10, [sp, #40] // 8-byte Folded Reload
+; LSE-NEXT:    ldr x11, [sp, #40] // 8-byte Folded Reload
+; LSE-NEXT:    ldr x10, [sp, #32] // 8-byte Folded Reload
 ; LSE-NEXT:    ldr x8, [sp, #24] // 8-byte Folded Reload
 ; LSE-NEXT:    mov x0, x10
 ; LSE-NEXT:    mov x1, x11
@@ -658,8 +658,8 @@ define i128 @test_rmw_nand_128(ptr %dst)   {
 ; LSE-NEXT:    str x8, [sp, #16] // 8-byte Folded Spill
 ; LSE-NEXT:    subs x11, x8, x11
 ; LSE-NEXT:    ccmp x9, x10, #0, eq
-; LSE-NEXT:    str x9, [sp, #40] // 8-byte Folded Spill
-; LSE-NEXT:    str x8, [sp, #32] // 8-byte Folded Spill
+; LSE-NEXT:    str x9, [sp, #32] // 8-byte Folded Spill
+; LSE-NEXT:    str x8, [sp, #40] // 8-byte Folded Spill
 ; LSE-NEXT:    b.ne .LBB9_1
 ; LSE-NEXT:    b .LBB9_2
 ; LSE-NEXT:  .LBB9_2: // %atomicrmw.end

diff  --git a/llvm/test/CodeGen/AArch64/bfis-in-loop.ll b/llvm/test/CodeGen/AArch64/bfis-in-loop.ll
index b0339222bc2df..43d49da1abd21 100644
--- a/llvm/test/CodeGen/AArch64/bfis-in-loop.ll
+++ b/llvm/test/CodeGen/AArch64/bfis-in-loop.ll
@@ -14,8 +14,8 @@ define i64 @bfis_in_loop_zero() {
 ; CHECK-LABEL: bfis_in_loop_zero:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    adrp x9, :got:global
-; CHECK-NEXT:    mov w8, wzr
 ; CHECK-NEXT:    mov x0, xzr
+; CHECK-NEXT:    mov w8, wzr
 ; CHECK-NEXT:    ldr x9, [x9, :got_lo12:global]
 ; CHECK-NEXT:    mov w10, #65536 // =0x10000
 ; CHECK-NEXT:    ldr x9, [x9]

diff  --git a/llvm/test/CodeGen/AArch64/block-layout-regression.mir b/llvm/test/CodeGen/AArch64/block-layout-regression.mir
deleted file mode 100644
index a7ab5c029b142..0000000000000
--- a/llvm/test/CodeGen/AArch64/block-layout-regression.mir
+++ /dev/null
@@ -1,85 +0,0 @@
-# RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s \
-# RUN:   -start-before=phi-node-elimination -stop-after=branch-relaxation \
-# RUN: | FileCheck %s
-
-# Verify an optimal block layout is produced for the following nested loop, when
-# there's a PHI node in bb.5 that relies on an operand in bb.2.
-# The previous layout used a CBNZX for a null check followed by a unconditional
-# branch to bb.6, instead of a fallthrough.
-
-#   [ bb.0 ENTRY ]
-#       |
-#       v
-#   [ bb.1 ] <-------+
-#     /   \          |
-#    v     v         |
-# [bb.2] [bb.3] <-+  |
-#    |     |      |  |
-#    |     v      |  |
-#    |   [bb.4] --+  |
-#    |     |         |
-#    v     v         |
-#   [  bb.5  ]       |
-#     /   \          |
-#    |     v         |
-#    |   [bb.6] -----+
-#    |     |
-#    v     v
-#   [ bb.7 RET ]
-
-# CHECK-LABEL: test
-# CHECK-NOT: CBNZX
-# CHECK-NOT: B %bb.
-# CHECK-COUNT-2: CBZX
-
----
-name: test
-tracksRegLiveness: true
-body: |
-  bb.0:
-    successors: %bb.1(0x80000000); %bb.1(100.00%)
-    liveins: $x0, $w1, $x2, $x3
-    %0:gpr64all = IMPLICIT_DEF
-    %1:gpr64common = IMPLICIT_DEF
-    %2:gpr32common = IMPLICIT_DEF
-    %3:gpr32 = IMPLICIT_DEF
-    B %bb.1
-
-  bb.1:
-    successors: %bb.2(0x30000000), %bb.3(0x50000000); %bb.2(37.50%), %bb.3(62.50%)
-    %4:gpr64common = PHI undef %0:gpr64all, %bb.0, %5:gpr64common, %bb.6
-    %6:gpr64 = LDRXui undef %1:gpr64common, 0 :: (load (s64))
-    STRXui killed %4:gpr64common, undef %1:gpr64common, 0 :: (store (s64))
-    CBNZX undef %6:gpr64, %bb.3
-
-  bb.2:
-    successors: %bb.5(0x80000000); %bb.5(100.00%)
-    %7:gpr64all = COPY killed %6:gpr64
-    B %bb.5
-
-  bb.3:
-    successors: %bb.5(0x04000000), %bb.4(0x7c000000); %bb.5(3.12%), %bb.4(96.88%)
-    dead $wzr = SUBSWrr killed undef %3:gpr32, killed undef %2:gpr32common, implicit-def $nzcv
-    Bcc 12, %bb.5, implicit killed undef $nzcv
-    B %bb.4
-
-  bb.4:
-    successors: %bb.5(0x04000000), %bb.3(0x7c000000); %bb.5(3.12%), %bb.3(96.88%)
-    dead $xzr = SUBSXrr killed undef %6:gpr64, killed undef %6:gpr64, implicit-def $nzcv
-    Bcc 1, %bb.3, implicit killed undef $nzcv
-
-  bb.5:
-    successors: %bb.7(0x04000000), %bb.6(0x7c000000); %bb.7(3.12%), %bb.6(96.88%)
-    %5:gpr64common = PHI %7:gpr64all, %bb.2, undef %0:gpr64all, %bb.3, undef %0:gpr64all, %bb.4
-    CBZX undef %5:gpr64common, %bb.7
-    B %bb.6
-
-  bb.6:
-    successors: %bb.7(0x04000000), %bb.1(0x7c000000); %bb.7(3.12%), %bb.1(96.88%)
-    dead $wzr = SUBSWrr killed undef %3:gpr32, killed undef %2:gpr32common, implicit-def $nzcv
-    Bcc 12, %bb.7, implicit killed undef $nzcv
-    B %bb.1
-
-  bb.7:
-    RET_ReallyLR
-...

diff  --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
index 327d0749c7dbf..7542e9c4b8f5b 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
@@ -35,10 +35,10 @@ define i32 @check_deinterleaving_has_deinterleave(ptr %a) {
 ; CHECK-LABEL: check_deinterleaving_has_deinterleave:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    movi v2.4s, #1
+; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    add x8, x0, #16
 ; CHECK-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
 ; CHECK-NEXT:    mov w9, #32 // =0x20
 ; CHECK-NEXT:    movi v4.2d, #0000000000000000
 ; CHECK-NEXT:    movi v5.2d, #0000000000000000
@@ -64,16 +64,16 @@ define i32 @check_deinterleaving_has_deinterleave(ptr %a) {
 ; CHECK-NEXT:    ushll v24.4s, v18.4h, #0
 ; CHECK-NEXT:    ushll2 v18.4s, v18.8h, #0
 ; CHECK-NEXT:    ushll v20.4s, v20.4h, #0
-; CHECK-NEXT:    and v21.16b, v21.16b, v2.16b
-; CHECK-NEXT:    and v19.16b, v19.16b, v2.16b
-; CHECK-NEXT:    and v22.16b, v22.16b, v2.16b
-; CHECK-NEXT:    and v17.16b, v17.16b, v2.16b
-; CHECK-NEXT:    and v23.16b, v23.16b, v2.16b
-; CHECK-NEXT:    and v24.16b, v24.16b, v2.16b
-; CHECK-NEXT:    and v18.16b, v18.16b, v2.16b
-; CHECK-NEXT:    and v20.16b, v20.16b, v2.16b
+; CHECK-NEXT:    and v21.16b, v21.16b, v1.16b
+; CHECK-NEXT:    and v19.16b, v19.16b, v1.16b
+; CHECK-NEXT:    and v22.16b, v22.16b, v1.16b
+; CHECK-NEXT:    and v17.16b, v17.16b, v1.16b
+; CHECK-NEXT:    and v23.16b, v23.16b, v1.16b
+; CHECK-NEXT:    and v24.16b, v24.16b, v1.16b
+; CHECK-NEXT:    and v18.16b, v18.16b, v1.16b
+; CHECK-NEXT:    and v20.16b, v20.16b, v1.16b
 ; CHECK-NEXT:    add v4.4s, v4.4s, v19.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v21.4s
+; CHECK-NEXT:    add v2.4s, v2.4s, v21.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v22.4s
 ; CHECK-NEXT:    add v3.4s, v3.4s, v17.4s
 ; CHECK-NEXT:    add v16.4s, v16.4s, v23.4s
@@ -82,12 +82,12 @@ define i32 @check_deinterleaving_has_deinterleave(ptr %a) {
 ; CHECK-NEXT:    add v7.4s, v7.4s, v18.4s
 ; CHECK-NEXT:    b.ne .LBB1_1
 ; CHECK-NEXT:  // %bb.2: // %middle.block
-; CHECK-NEXT:    add v2.4s, v7.4s, v3.4s
+; CHECK-NEXT:    add v1.4s, v7.4s, v3.4s
 ; CHECK-NEXT:    add v3.4s, v16.4s, v4.4s
 ; CHECK-NEXT:    add v0.4s, v5.4s, v0.4s
-; CHECK-NEXT:    add v1.4s, v6.4s, v1.4s
-; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    add v1.4s, v2.4s, v3.4s
+; CHECK-NEXT:    add v2.4s, v6.4s, v2.4s
+; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w0, s0

diff  --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
index 69df51a27d10c..880bd2904154c 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
@@ -18,8 +18,8 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov w8, #100 // =0x64
 ; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    whilelo p1.d, xzr, x8
-; CHECK-NEXT:    mov x10, x9
-; CHECK-NEXT:    rdvl x11, #2
+; CHECK-NEXT:    rdvl x10, #2
+; CHECK-NEXT:    mov x11, x9
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
 ; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
@@ -33,16 +33,16 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ld1d { z4.d }, p2/z, [x1, #1, mul vl]
 ; CHECK-NEXT:    ld1d { z3.d }, p1/z, [x0]
 ; CHECK-NEXT:    ld1d { z5.d }, p1/z, [x1]
-; CHECK-NEXT:    add x1, x1, x11
-; CHECK-NEXT:    add x0, x0, x11
+; CHECK-NEXT:    add x1, x1, x10
+; CHECK-NEXT:    add x0, x0, x10
 ; CHECK-NEXT:    fcmla z7.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT:    fcmla z6.d, p0/m, z5.d, z3.d, #0
 ; CHECK-NEXT:    fcmla z7.d, p0/m, z4.d, z2.d, #90
 ; CHECK-NEXT:    fcmla z6.d, p0/m, z5.d, z3.d, #90
 ; CHECK-NEXT:    mov z0.d, p2/m, z7.d
 ; CHECK-NEXT:    mov z1.d, p1/m, z6.d
-; CHECK-NEXT:    whilelo p1.d, x10, x8
-; CHECK-NEXT:    add x10, x10, x9
+; CHECK-NEXT:    whilelo p1.d, x11, x8
+; CHECK-NEXT:    add x11, x11, x9
 ; CHECK-NEXT:    b.mi .LBB0_1
 ; CHECK-NEXT:  // %bb.2: // %exit.block
 ; CHECK-NEXT:    uzp1 z2.d, z1.d, z0.d
@@ -217,10 +217,10 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
 ; CHECK-NEXT:    mov w8, #100 // =0x64
 ; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    whilelo p1.d, xzr, x8
-; CHECK-NEXT:    mov x10, x9
-; CHECK-NEXT:    rdvl x11, #2
+; CHECK-NEXT:    rdvl x10, #2
+; CHECK-NEXT:    cnth x11
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    cnth x12
+; CHECK-NEXT:    mov x12, x9
 ; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
 ; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
 ; CHECK-NEXT:  .LBB2_1: // %vector.body
@@ -228,7 +228,7 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
 ; CHECK-NEXT:    ld1w { z2.d }, p1/z, [x2]
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    mov z7.d, z0.d
-; CHECK-NEXT:    add x2, x2, x12
+; CHECK-NEXT:    add x2, x2, x11
 ; CHECK-NEXT:    and z2.d, z2.d, #0xffffffff
 ; CHECK-NEXT:    cmpne p1.d, p1/z, z2.d, #0
 ; CHECK-NEXT:    zip2 p2.d, p1.d, p1.d
@@ -237,16 +237,16 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
 ; CHECK-NEXT:    ld1d { z4.d }, p2/z, [x1, #1, mul vl]
 ; CHECK-NEXT:    ld1d { z3.d }, p1/z, [x0]
 ; CHECK-NEXT:    ld1d { z5.d }, p1/z, [x1]
-; CHECK-NEXT:    add x1, x1, x11
-; CHECK-NEXT:    add x0, x0, x11
+; CHECK-NEXT:    add x1, x1, x10
+; CHECK-NEXT:    add x0, x0, x10
 ; CHECK-NEXT:    fcmla z7.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT:    fcmla z6.d, p0/m, z5.d, z3.d, #0
 ; CHECK-NEXT:    fcmla z7.d, p0/m, z4.d, z2.d, #90
 ; CHECK-NEXT:    fcmla z6.d, p0/m, z5.d, z3.d, #90
 ; CHECK-NEXT:    mov z0.d, p2/m, z7.d
 ; CHECK-NEXT:    mov z1.d, p1/m, z6.d
-; CHECK-NEXT:    whilelo p1.d, x10, x8
-; CHECK-NEXT:    add x10, x10, x9
+; CHECK-NEXT:    whilelo p1.d, x12, x8
+; CHECK-NEXT:    add x12, x12, x9
 ; CHECK-NEXT:    b.mi .LBB2_1
 ; CHECK-NEXT:  // %bb.2: // %exit.block
 ; CHECK-NEXT:    uzp1 z2.d, z1.d, z0.d

diff  --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
index c977869d2ce95..aed3072bb4af3 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
@@ -25,14 +25,14 @@ define dso_local %"struct.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldp q3, q2, [x9]
 ; CHECK-NEXT:    cmp x8, #1600
 ; CHECK-NEXT:    ldp q5, q4, [x10]
-; CHECK-NEXT:    fcmla v1.2d, v5.2d, v3.2d, #0
-; CHECK-NEXT:    fcmla v0.2d, v4.2d, v2.2d, #0
-; CHECK-NEXT:    fcmla v1.2d, v5.2d, v3.2d, #90
-; CHECK-NEXT:    fcmla v0.2d, v4.2d, v2.2d, #90
+; CHECK-NEXT:    fcmla v0.2d, v5.2d, v3.2d, #0
+; CHECK-NEXT:    fcmla v1.2d, v4.2d, v2.2d, #0
+; CHECK-NEXT:    fcmla v0.2d, v5.2d, v3.2d, #90
+; CHECK-NEXT:    fcmla v1.2d, v4.2d, v2.2d, #90
 ; CHECK-NEXT:    b.ne .LBB0_1
 ; CHECK-NEXT:  // %bb.2: // %middle.block
-; CHECK-NEXT:    zip2 v2.2d, v1.2d, v0.2d
-; CHECK-NEXT:    zip1 v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    zip2 v2.2d, v0.2d, v1.2d
+; CHECK-NEXT:    zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    faddp d0, v0.2d
 ; CHECK-NEXT:    faddp d1, v2.2d
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/AArch64/phi.ll b/llvm/test/CodeGen/AArch64/phi.ll
index 446c3beeff976..02842c04bf7bf 100644
--- a/llvm/test/CodeGen/AArch64/phi.ll
+++ b/llvm/test/CodeGen/AArch64/phi.ll
@@ -131,8 +131,8 @@ define i128 @ti128(i1 %c, ptr %p, i128 %a, i128 %b) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    tbz w0, #0, .LBB4_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
-; CHECK-SD-NEXT:    mov x5, x3
 ; CHECK-SD-NEXT:    mov x4, x2
+; CHECK-SD-NEXT:    mov x5, x3
 ; CHECK-SD-NEXT:    str wzr, [x1]
 ; CHECK-SD-NEXT:  .LBB4_2: // %e
 ; CHECK-SD-NEXT:    mov x0, x4
@@ -336,9 +336,9 @@ define <3 x i8> @tv3i8(i1 %c, ptr %p, <3 x i8> %a, <3 x i8> %b) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    tbz w0, #0, .LBB11_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
-; CHECK-SD-NEXT:    mov w7, w4
-; CHECK-SD-NEXT:    mov w6, w3
 ; CHECK-SD-NEXT:    mov w5, w2
+; CHECK-SD-NEXT:    mov w6, w3
+; CHECK-SD-NEXT:    mov w7, w4
 ; CHECK-SD-NEXT:    str wzr, [x1]
 ; CHECK-SD-NEXT:  .LBB11_2: // %e
 ; CHECK-SD-NEXT:    mov w0, w5
@@ -454,8 +454,8 @@ define <32 x i8> @tv32i8(i1 %c, ptr %p, <32 x i8> %a, <32 x i8> %b) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    tbz w0, #0, .LBB15_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
-; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    mov v2.16b, v0.16b
+; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    str wzr, [x1]
 ; CHECK-SD-NEXT:  .LBB15_2: // %e
 ; CHECK-SD-NEXT:    mov v0.16b, v2.16b
@@ -584,8 +584,8 @@ define <16 x i16> @tv16i16(i1 %c, ptr %p, <16 x i16> %a, <16 x i16> %b) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    tbz w0, #0, .LBB20_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
-; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    mov v2.16b, v0.16b
+; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    str wzr, [x1]
 ; CHECK-SD-NEXT:  .LBB20_2: // %e
 ; CHECK-SD-NEXT:    mov v0.16b, v2.16b
@@ -679,8 +679,8 @@ define <8 x i32> @tv8i32(i1 %c, ptr %p, <8 x i32> %a, <8 x i32> %b) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    tbz w0, #0, .LBB24_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
-; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    mov v2.16b, v0.16b
+; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    str wzr, [x1]
 ; CHECK-SD-NEXT:  .LBB24_2: // %e
 ; CHECK-SD-NEXT:    mov v0.16b, v2.16b
@@ -734,10 +734,10 @@ define <3 x i64> @tv3i64(i1 %c, ptr %p, <3 x i64> %a, <3 x i64> %b) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    tbz w0, #0, .LBB26_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
-; CHECK-SD-NEXT:    fmov d5, d2
+; CHECK-SD-NEXT:    fmov d3, d0
 ; CHECK-SD-NEXT:    fmov d4, d1
 ; CHECK-SD-NEXT:    str wzr, [x1]
-; CHECK-SD-NEXT:    fmov d3, d0
+; CHECK-SD-NEXT:    fmov d5, d2
 ; CHECK-SD-NEXT:  .LBB26_2: // %e
 ; CHECK-SD-NEXT:    fmov d0, d3
 ; CHECK-SD-NEXT:    fmov d1, d4
@@ -783,8 +783,8 @@ define <4 x i64> @tv4i64(i1 %c, ptr %p, <4 x i64> %a, <4 x i64> %b) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    tbz w0, #0, .LBB27_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
-; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    mov v2.16b, v0.16b
+; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    str wzr, [x1]
 ; CHECK-SD-NEXT:  .LBB27_2: // %e
 ; CHECK-SD-NEXT:    mov v0.16b, v2.16b
@@ -818,8 +818,8 @@ define <2 x i128> @tv2i128(i1 %c, ptr %p, <2 x i128> %a, <2 x i128> %b) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    tbz w0, #0, .LBB28_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
-; CHECK-SD-NEXT:    mov x7, x3
 ; CHECK-SD-NEXT:    mov x6, x2
+; CHECK-SD-NEXT:    mov x7, x3
 ; CHECK-SD-NEXT:    str wzr, [x1]
 ; CHECK-SD-NEXT:    b .LBB28_3
 ; CHECK-SD-NEXT:  .LBB28_2:
@@ -883,10 +883,10 @@ define <3 x ptr> @tv3p0(i1 %c, ptr %p, <3 x ptr> %a, <3 x ptr> %b) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    tbz w0, #0, .LBB30_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
-; CHECK-SD-NEXT:    fmov d5, d2
+; CHECK-SD-NEXT:    fmov d3, d0
 ; CHECK-SD-NEXT:    fmov d4, d1
 ; CHECK-SD-NEXT:    str wzr, [x1]
-; CHECK-SD-NEXT:    fmov d3, d0
+; CHECK-SD-NEXT:    fmov d5, d2
 ; CHECK-SD-NEXT:  .LBB30_2: // %e
 ; CHECK-SD-NEXT:    fmov d0, d3
 ; CHECK-SD-NEXT:    fmov d1, d4
@@ -932,8 +932,8 @@ define <4 x ptr> @tv4p0(i1 %c, ptr %p, <4 x ptr> %a, <4 x ptr> %b) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    tbz w0, #0, .LBB31_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
-; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    mov v2.16b, v0.16b
+; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    str wzr, [x1]
 ; CHECK-SD-NEXT:  .LBB31_2: // %e
 ; CHECK-SD-NEXT:    mov v0.16b, v2.16b
@@ -1047,8 +1047,8 @@ define <16 x half> @tv16f16(i1 %c, ptr %p, <16 x half> %a, <16 x half> %b) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    tbz w0, #0, .LBB36_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
-; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    mov v2.16b, v0.16b
+; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    str wzr, [x1]
 ; CHECK-SD-NEXT:  .LBB36_2: // %e
 ; CHECK-SD-NEXT:    mov v0.16b, v2.16b
@@ -1142,8 +1142,8 @@ define <8 x float> @tv8f32(i1 %c, ptr %p, <8 x float> %a, <8 x float> %b) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    tbz w0, #0, .LBB40_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
-; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    mov v2.16b, v0.16b
+; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    str wzr, [x1]
 ; CHECK-SD-NEXT:  .LBB40_2: // %e
 ; CHECK-SD-NEXT:    mov v0.16b, v2.16b
@@ -1197,10 +1197,10 @@ define <3 x double> @tv3f64(i1 %c, ptr %p, <3 x double> %a, <3 x double> %b) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    tbz w0, #0, .LBB42_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
-; CHECK-SD-NEXT:    fmov d5, d2
+; CHECK-SD-NEXT:    fmov d3, d0
 ; CHECK-SD-NEXT:    fmov d4, d1
 ; CHECK-SD-NEXT:    str wzr, [x1]
-; CHECK-SD-NEXT:    fmov d3, d0
+; CHECK-SD-NEXT:    fmov d5, d2
 ; CHECK-SD-NEXT:  .LBB42_2: // %e
 ; CHECK-SD-NEXT:    fmov d0, d3
 ; CHECK-SD-NEXT:    fmov d1, d4
@@ -1246,8 +1246,8 @@ define <4 x double> @tv4f64(i1 %c, ptr %p, <4 x double> %a, <4 x double> %b) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    tbz w0, #0, .LBB43_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
-; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    mov v2.16b, v0.16b
+; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    str wzr, [x1]
 ; CHECK-SD-NEXT:  .LBB43_2: // %e
 ; CHECK-SD-NEXT:    mov v0.16b, v2.16b
@@ -1281,8 +1281,8 @@ define <2 x fp128> @tv2f128(i1 %c, ptr %p, <2 x fp128> %a, <2 x fp128> %b) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    tbz w0, #0, .LBB44_2
 ; CHECK-SD-NEXT:  // %bb.1: // %t
-; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    mov v2.16b, v0.16b
+; CHECK-SD-NEXT:    mov v3.16b, v1.16b
 ; CHECK-SD-NEXT:    str wzr, [x1]
 ; CHECK-SD-NEXT:  .LBB44_2: // %e
 ; CHECK-SD-NEXT:    mov v0.16b, v2.16b
@@ -1296,8 +1296,8 @@ define <2 x fp128> @tv2f128(i1 %c, ptr %p, <2 x fp128> %a, <2 x fp128> %b) {
 ; CHECK-GI-NEXT:    mov d4, v1.d[1]
 ; CHECK-GI-NEXT:    mov d5, v0.d[1]
 ; CHECK-GI-NEXT:    str wzr, [x1]
-; CHECK-GI-NEXT:    fmov d3, d1
 ; CHECK-GI-NEXT:    fmov d2, d0
+; CHECK-GI-NEXT:    fmov d3, d1
 ; CHECK-GI-NEXT:    b .LBB44_3
 ; CHECK-GI-NEXT:  .LBB44_2:
 ; CHECK-GI-NEXT:    mov d4, v3.d[1]

diff  --git a/llvm/test/CodeGen/AArch64/pr48188.ll b/llvm/test/CodeGen/AArch64/pr48188.ll
index 634517b099f98..d01069696572e 100644
--- a/llvm/test/CodeGen/AArch64/pr48188.ll
+++ b/llvm/test/CodeGen/AArch64/pr48188.ll
@@ -14,17 +14,17 @@ define void @test() nounwind {
 ; GISEL-NEXT:    b .LBB0_1
 ; GISEL-NEXT:  .LBB0_1: // %loop
 ; GISEL-NEXT:    // =>This Inner Loop Header: Depth=1
-; GISEL-NEXT:    ldr x8, [sp] // 8-byte Folded Reload
-; GISEL-NEXT:    ldr x9, [sp, #8] // 8-byte Folded Reload
-; GISEL-NEXT:    str x9, [sp, #8] // 8-byte Folded Spill
-; GISEL-NEXT:    str x8, [sp] // 8-byte Folded Spill
+; GISEL-NEXT:    ldr x8, [sp, #8] // 8-byte Folded Reload
+; GISEL-NEXT:    ldr x9, [sp] // 8-byte Folded Reload
+; GISEL-NEXT:    str x9, [sp] // 8-byte Folded Spill
+; GISEL-NEXT:    str x8, [sp, #8] // 8-byte Folded Spill
 ; GISEL-NEXT:    b .LBB0_1
 ;
 ; SDAG-LABEL: test:
 ; SDAG:       // %bb.0: // %entry
 ; SDAG-NEXT:    sub sp, sp, #16
-; SDAG-NEXT:    mov x0, xzr
-; SDAG-NEXT:    mov x1, x0
+; SDAG-NEXT:    mov x1, xzr
+; SDAG-NEXT:    mov x0, x1
 ; SDAG-NEXT:    str x1, [sp] // 8-byte Folded Spill
 ; SDAG-NEXT:    str x0, [sp, #8] // 8-byte Folded Spill
 ; SDAG-NEXT:    b .LBB0_1

diff  --git a/llvm/test/CodeGen/AArch64/ragreedy-csr.ll b/llvm/test/CodeGen/AArch64/ragreedy-csr.ll
index 921cadc7a7511..5b501762418ef 100644
--- a/llvm/test/CodeGen/AArch64/ragreedy-csr.ll
+++ b/llvm/test/CodeGen/AArch64/ragreedy-csr.ll
@@ -211,27 +211,27 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
 ; CHECK-NEXT:    cmp w12, #2
 ; CHECK-NEXT:    b.ne LBB0_43
 ; CHECK-NEXT:  ; %bb.35: ; %while.cond130.preheader
-; CHECK-NEXT:    ldrb w12, [x9, x11]
-; CHECK-NEXT:    cbz w12, LBB0_23
+; CHECK-NEXT:    ldrb w8, [x9, x11]
+; CHECK-NEXT:    cbz w8, LBB0_23
 ; CHECK-NEXT:  ; %bb.36: ; %land.rhs134.preheader
-; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    mov x12, xzr
 ; CHECK-NEXT:    mov w0, #1 ; =0x1
 ; CHECK-NEXT:    b LBB0_38
 ; CHECK-NEXT:  LBB0_37: ; %if.then152
 ; CHECK-NEXT:    ; in Loop: Header=BB0_38 Depth=1
-; CHECK-NEXT:    add x12, x9, x8
-; CHECK-NEXT:    add x8, x8, #1
-; CHECK-NEXT:    add x12, x12, x11
-; CHECK-NEXT:    ldrb w12, [x12, #1]
-; CHECK-NEXT:    cbz w12, LBB0_43
+; CHECK-NEXT:    add x8, x9, x12
+; CHECK-NEXT:    add x12, x12, #1
+; CHECK-NEXT:    add x8, x8, x11
+; CHECK-NEXT:    ldrb w8, [x8, #1]
+; CHECK-NEXT:    cbz w8, LBB0_43
 ; CHECK-NEXT:  LBB0_38: ; %land.rhs134
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add x13, x10, x8
+; CHECK-NEXT:    add x13, x10, x12
 ; CHECK-NEXT:    ldrb w13, [x13, x11]
 ; CHECK-NEXT:    cbz w13, LBB0_23
 ; CHECK-NEXT:  ; %bb.39: ; %while.body139
 ; CHECK-NEXT:    ; in Loop: Header=BB0_38 Depth=1
-; CHECK-NEXT:    cmp w12, w13
+; CHECK-NEXT:    cmp w8, w13
 ; CHECK-NEXT:    b.eq LBB0_37
 ; CHECK-NEXT:  ; %bb.40: ; %while.body139
 ; CHECK-NEXT:    ; in Loop: Header=BB0_38 Depth=1
@@ -239,7 +239,7 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
 ; CHECK-NEXT:    b.eq LBB0_37
 ; CHECK-NEXT:  ; %bb.41: ; %while.body139
 ; CHECK-NEXT:    ; in Loop: Header=BB0_38 Depth=1
-; CHECK-NEXT:    cmp w12, #94
+; CHECK-NEXT:    cmp w8, #94
 ; CHECK-NEXT:    b.eq LBB0_37
 ; CHECK-NEXT:  LBB0_42:
 ; CHECK-NEXT:    mov w0, wzr

diff  --git a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
index 8f1c504a7f684..c91de8f3a0a47 100644
--- a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
+++ b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
@@ -31,12 +31,14 @@ define dso_local void @run_test() local_unnamed_addr uwtable {
 ; CHECK-NEXT:    .cfi_offset b14, -104
 ; CHECK-NEXT:    .cfi_offset b15, -112
 ; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    // implicit-def: $q1
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    adrp x9, B+48
 ; CHECK-NEXT:    add x9, x9, :lo12:B+48
 ; CHECK-NEXT:    adrp x10, A
 ; CHECK-NEXT:    add x10, x10, :lo12:A
 ; CHECK-NEXT:    mov x11, xzr
+; CHECK-NEXT:    // kill: killed $q1
 ; CHECK-NEXT:    // implicit-def: $q1
 ; CHECK-NEXT:    mov x12, xzr
 ; CHECK-NEXT:    // implicit-def: $q0
@@ -54,7 +56,7 @@ define dso_local void @run_test() local_unnamed_addr uwtable {
 ; CHECK-NEXT:    // implicit-def: $q22
 ; CHECK-NEXT:    // implicit-def: $q23
 ; CHECK-NEXT:    // implicit-def: $q24
-; CHECK-NEXT:    // implicit-def: $q25
+; CHECK-NEXT:    // implicit-def: $q9
 ; CHECK-NEXT:    // implicit-def: $q27
 ; CHECK-NEXT:    // implicit-def: $q12
 ; CHECK-NEXT:    // implicit-def: $q28
@@ -64,97 +66,95 @@ define dso_local void @run_test() local_unnamed_addr uwtable {
 ; CHECK-NEXT:    // implicit-def: $q30
 ; CHECK-NEXT:    // implicit-def: $q11
 ; CHECK-NEXT:    // implicit-def: $q31
-; CHECK-NEXT:    // kill: killed $q1
-; CHECK-NEXT:    // implicit-def: $q9
 ; CHECK-NEXT:    // implicit-def: $q13
+; CHECK-NEXT:    // kill: killed $q1
 ; CHECK-NEXT:    // implicit-def: $q1
 ; CHECK-NEXT:    // kill: killed $q1
 ; CHECK-NEXT:  .LBB0_1: // %for.cond1.preheader
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    stp q29, q15, [sp] // 32-byte Folded Spill
+; CHECK-NEXT:    ldr q15, [x8]
 ; CHECK-NEXT:    ldr x15, [x8]
+; CHECK-NEXT:    str q14, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    add x20, x10, x11
-; CHECK-NEXT:    ldr x6, [x8]
-; CHECK-NEXT:    ldr x20, [x20, #128]
-; CHECK-NEXT:    stp q25, q29, [sp] // 32-byte Folded Spill
-; CHECK-NEXT:    mov v29.16b, v21.16b
-; CHECK-NEXT:    stp q15, q14, [sp, #32] // 32-byte Folded Spill
-; CHECK-NEXT:    ldr q14, [x8]
-; CHECK-NEXT:    mov v21.16b, v0.16b
-; CHECK-NEXT:    ldr q15, [x8]
 ; CHECK-NEXT:    mov v8.16b, v28.16b
-; CHECK-NEXT:    mov v28.16b, v24.16b
-; CHECK-NEXT:    fmov x13, d14
-; CHECK-NEXT:    mov x16, v14.d[1]
-; CHECK-NEXT:    mov v24.16b, v20.16b
 ; CHECK-NEXT:    fmov x2, d15
 ; CHECK-NEXT:    mov x17, v15.d[1]
+; CHECK-NEXT:    ldr q14, [x8]
+; CHECK-NEXT:    mov v28.16b, v24.16b
+; CHECK-NEXT:    mov v24.16b, v20.16b
 ; CHECK-NEXT:    mov v20.16b, v17.16b
-; CHECK-NEXT:    ldr q14, [x9], #64
+; CHECK-NEXT:    fmov x13, d14
+; CHECK-NEXT:    mov x16, v14.d[1]
 ; CHECK-NEXT:    mov v17.16b, v5.16b
-; CHECK-NEXT:    mul x18, x13, x15
+; CHECK-NEXT:    mul x3, x2, x15
+; CHECK-NEXT:    ldr q14, [x9], #64
 ; CHECK-NEXT:    ldr q5, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    mov v25.16b, v6.16b
+; CHECK-NEXT:    ldr x6, [x8]
+; CHECK-NEXT:    ldr x20, [x20, #128]
+; CHECK-NEXT:    mul x1, x17, x15
 ; CHECK-NEXT:    mov x14, v14.d[1]
 ; CHECK-NEXT:    fmov x5, d14
+; CHECK-NEXT:    mov v29.16b, v21.16b
+; CHECK-NEXT:    mov v21.16b, v0.16b
+; CHECK-NEXT:    mov v25.16b, v6.16b
+; CHECK-NEXT:    mul x18, x13, x15
 ; CHECK-NEXT:    mov v6.16b, v2.16b
-; CHECK-NEXT:    mul x3, x2, x15
 ; CHECK-NEXT:    mov v26.16b, v22.16b
+; CHECK-NEXT:    fmov d15, x3
 ; CHECK-NEXT:    mov v22.16b, v18.16b
 ; CHECK-NEXT:    mov v18.16b, v7.16b
+; CHECK-NEXT:    mul x0, x16, x15
 ; CHECK-NEXT:    mov v7.16b, v3.16b
 ; CHECK-NEXT:    mov v16.16b, v4.16b
-; CHECK-NEXT:    mul x0, x16, x15
 ; CHECK-NEXT:    add x11, x11, #8
 ; CHECK-NEXT:    add x12, x12, #1
-; CHECK-NEXT:    fmov d14, x18
-; CHECK-NEXT:    cmp x11, #64
-; CHECK-NEXT:    mul x1, x17, x15
-; CHECK-NEXT:    fmov d15, x3
+; CHECK-NEXT:    mov v15.d[1], x1
 ; CHECK-NEXT:    mul x4, x14, x15
-; CHECK-NEXT:    mov v14.d[1], x0
+; CHECK-NEXT:    cmp x11, #64
+; CHECK-NEXT:    fmov d14, x18
 ; CHECK-NEXT:    mul x15, x5, x15
-; CHECK-NEXT:    mov v15.d[1], x1
+; CHECK-NEXT:    add v5.2d, v5.2d, v15.2d
 ; CHECK-NEXT:    mul x21, x2, x6
-; CHECK-NEXT:    add v5.2d, v5.2d, v14.2d
-; CHECK-NEXT:    add v9.2d, v9.2d, v14.2d
+; CHECK-NEXT:    mov v14.d[1], x0
 ; CHECK-NEXT:    mul x2, x2, x20
 ; CHECK-NEXT:    fmov d0, x15
-; CHECK-NEXT:    add v13.2d, v13.2d, v15.2d
-; CHECK-NEXT:    mul x22, x13, x20
 ; CHECK-NEXT:    str q5, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    ldr q5, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    mul x22, x13, x20
+; CHECK-NEXT:    add v5.2d, v5.2d, v14.2d
 ; CHECK-NEXT:    fmov d3, x21
-; CHECK-NEXT:    ldp q15, q14, [sp, #32] // 32-byte Folded Reload
 ; CHECK-NEXT:    mul x19, x17, x6
 ; CHECK-NEXT:    mov v0.d[1], x4
-; CHECK-NEXT:    mov v5.16b, v13.16b
 ; CHECK-NEXT:    fmov d1, x2
-; CHECK-NEXT:    mov v13.16b, v9.16b
-; CHECK-NEXT:    ldr q9, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    mul x17, x17, x20
+; CHECK-NEXT:    str q5, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    add v5.2d, v13.2d, v14.2d
 ; CHECK-NEXT:    fmov d2, x22
+; CHECK-NEXT:    ldr q13, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    mul x7, x16, x6
-; CHECK-NEXT:    add v9.2d, v9.2d, v0.2d
+; CHECK-NEXT:    ldp q15, q14, [sp, #16] // 32-byte Folded Reload
 ; CHECK-NEXT:    mov v3.d[1], x19
+; CHECK-NEXT:    add v13.2d, v13.2d, v0.2d
 ; CHECK-NEXT:    mul x16, x16, x20
 ; CHECK-NEXT:    mov v1.d[1], x17
-; CHECK-NEXT:    str q9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    mov v9.16b, v13.16b
 ; CHECK-NEXT:    mul x23, x5, x20
+; CHECK-NEXT:    str q13, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov v13.16b, v5.16b
 ; CHECK-NEXT:    mov v5.16b, v17.16b
 ; CHECK-NEXT:    mov v17.16b, v20.16b
 ; CHECK-NEXT:    mov v20.16b, v24.16b
-; CHECK-NEXT:    mov v24.16b, v28.16b
 ; CHECK-NEXT:    mul x13, x13, x6
+; CHECK-NEXT:    mov v24.16b, v28.16b
 ; CHECK-NEXT:    add v11.2d, v11.2d, v3.2d
-; CHECK-NEXT:    add v27.2d, v27.2d, v3.2d
 ; CHECK-NEXT:    mov v2.d[1], x16
 ; CHECK-NEXT:    add v15.2d, v15.2d, v1.2d
-; CHECK-NEXT:    add v23.2d, v23.2d, v3.2d
+; CHECK-NEXT:    add v27.2d, v27.2d, v3.2d
 ; CHECK-NEXT:    mul x18, x14, x20
+; CHECK-NEXT:    add v23.2d, v23.2d, v3.2d
 ; CHECK-NEXT:    add v19.2d, v19.2d, v3.2d
-; CHECK-NEXT:    add v10.2d, v10.2d, v3.2d
 ; CHECK-NEXT:    fmov d4, x23
+; CHECK-NEXT:    add v10.2d, v10.2d, v3.2d
 ; CHECK-NEXT:    mul x15, x5, x6
 ; CHECK-NEXT:    fmov d0, x13
 ; CHECK-NEXT:    add v14.2d, v14.2d, v2.2d
@@ -164,7 +164,6 @@ define dso_local void @run_test() local_unnamed_addr uwtable {
 ; CHECK-NEXT:    mov v7.16b, v18.16b
 ; CHECK-NEXT:    mov v4.d[1], x18
 ; CHECK-NEXT:    mov v18.16b, v22.16b
-; CHECK-NEXT:    mov v6.16b, v25.16b
 ; CHECK-NEXT:    mov v0.d[1], x7
 ; CHECK-NEXT:    fmov d1, x15
 ; CHECK-NEXT:    add v28.2d, v8.2d, v4.2d
@@ -182,36 +181,38 @@ define dso_local void @run_test() local_unnamed_addr uwtable {
 ; CHECK-NEXT:    add v3.2d, v3.2d, v0.2d
 ; CHECK-NEXT:    mov v0.16b, v21.16b
 ; CHECK-NEXT:    mov v21.16b, v29.16b
-; CHECK-NEXT:    add v6.2d, v6.2d, v1.2d
-; CHECK-NEXT:    ldp q25, q29, [sp] // 32-byte Folded Reload
+; CHECK-NEXT:    ldr q29, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    add v9.2d, v9.2d, v1.2d
+; CHECK-NEXT:    add v6.2d, v25.2d, v1.2d
 ; CHECK-NEXT:    add v5.2d, v5.2d, v1.2d
+; CHECK-NEXT:    add v29.2d, v29.2d, v1.2d
 ; CHECK-NEXT:    add v21.2d, v21.2d, v1.2d
 ; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    add v29.2d, v29.2d, v1.2d
-; CHECK-NEXT:    add v25.2d, v25.2d, v1.2d
 ; CHECK-NEXT:    b.ne .LBB0_1
 ; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup
-; CHECK-NEXT:    ldr q1, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr q1, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    adrp x8, C
 ; CHECK-NEXT:    add x8, x8, :lo12:C
-; CHECK-NEXT:    stp q31, q11, [x8, #64]
+; CHECK-NEXT:    stp q11, q30, [x8, #80]
 ; CHECK-NEXT:    ldp x20, x19, [sp, #192] // 16-byte Folded Reload
-; CHECK-NEXT:    stp q1, q13, [x8]
-; CHECK-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    str q1, [x8]
+; CHECK-NEXT:    ldr q1, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr x23, [sp, #160] // 8-byte Folded Reload
-; CHECK-NEXT:    stp q30, q29, [x8, #96]
-; CHECK-NEXT:    ldp x22, x21, [sp, #176] // 16-byte Folded Reload
-; CHECK-NEXT:    stp q9, q1, [x8, #32]
-; CHECK-NEXT:    ldp d9, d8, [sp, #144] // 16-byte Folded Reload
 ; CHECK-NEXT:    stp q15, q14, [x8, #144]
-; CHECK-NEXT:    ldp d15, d14, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #176] // 16-byte Folded Reload
+; CHECK-NEXT:    stp q1, q13, [x8, #16]
+; CHECK-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    stp q28, q12, [x8, #176]
 ; CHECK-NEXT:    ldp d13, d12, [sp, #112] // 16-byte Folded Reload
+; CHECK-NEXT:    stp q1, q31, [x8, #48]
+; CHECK-NEXT:    ldp d15, d14, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    stp q9, q24, [x8, #240]
+; CHECK-NEXT:    ldp d9, d8, [sp, #144] // 16-byte Folded Reload
 ; CHECK-NEXT:    stp q19, q18, [x8, #336]
 ; CHECK-NEXT:    stp q10, q7, [x8, #400]
 ; CHECK-NEXT:    ldp d11, d10, [sp, #128] // 16-byte Folded Reload
+; CHECK-NEXT:    str q29, [x8, #112]
 ; CHECK-NEXT:    str q27, [x8, #208]
-; CHECK-NEXT:    stp q25, q24, [x8, #240]
 ; CHECK-NEXT:    stp q23, q22, [x8, #272]
 ; CHECK-NEXT:    stp q21, q20, [x8, #304]
 ; CHECK-NEXT:    stp q6, q17, [x8, #368]

diff  --git a/llvm/test/CodeGen/AArch64/reduce-or-opt.ll b/llvm/test/CodeGen/AArch64/reduce-or-opt.ll
index b3dfab8f69b59..f5df5ea53c990 100644
--- a/llvm/test/CodeGen/AArch64/reduce-or-opt.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-or-opt.ll
@@ -95,18 +95,18 @@ define i64 @select_or_reduce_nxv2i1(ptr nocapture noundef readonly %src) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x10, xzr
-; CHECK-NEXT:    neg x9, x8
-; CHECK-NEXT:    add x9, x9, #4
+; CHECK-NEXT:    mov x9, xzr
+; CHECK-NEXT:    neg x10, x8
+; CHECK-NEXT:    add x10, x10, #4
 ; CHECK-NEXT:  .LBB2_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x10, lsl #3]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
 ; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
 ; CHECK-NEXT:    b.ne .LBB2_3
 ; CHECK-NEXT:  // %bb.2: // %vector.body
 ; CHECK-NEXT:    // in Loop: Header=BB2_1 Depth=1
-; CHECK-NEXT:    cmp x9, x10
-; CHECK-NEXT:    add x10, x10, x8
+; CHECK-NEXT:    cmp x10, x9
+; CHECK-NEXT:    add x9, x9, x8
 ; CHECK-NEXT:    b.ne .LBB2_1
 ; CHECK-NEXT:  .LBB2_3: // %middle.split
 ; CHECK-NEXT:    ptest p0, p1.b
@@ -138,18 +138,18 @@ define i64 @br_or_reduce_nxv2i1(ptr nocapture noundef readonly %src, ptr noundef
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x10, xzr
-; CHECK-NEXT:    neg x9, x8
-; CHECK-NEXT:    add x9, x9, #4
+; CHECK-NEXT:    mov x9, xzr
+; CHECK-NEXT:    neg x10, x8
+; CHECK-NEXT:    add x10, x10, #4
 ; CHECK-NEXT:  .LBB3_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x10, lsl #3]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
 ; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
 ; CHECK-NEXT:    b.ne .LBB3_3
 ; CHECK-NEXT:  // %bb.2: // %vector.body
 ; CHECK-NEXT:    // in Loop: Header=BB3_1 Depth=1
-; CHECK-NEXT:    cmp x9, x10
-; CHECK-NEXT:    add x10, x10, x8
+; CHECK-NEXT:    cmp x10, x9
+; CHECK-NEXT:    add x9, x9, x8
 ; CHECK-NEXT:    b.ne .LBB3_1
 ; CHECK-NEXT:  .LBB3_3: // %middle.split
 ; CHECK-NEXT:    ptest p0, p1.b

diff  --git a/llvm/test/CodeGen/AArch64/sink-and-fold.ll b/llvm/test/CodeGen/AArch64/sink-and-fold.ll
index 163124c0d2757..4d383fefc43c7 100644
--- a/llvm/test/CodeGen/AArch64/sink-and-fold.ll
+++ b/llvm/test/CodeGen/AArch64/sink-and-fold.ll
@@ -151,7 +151,7 @@ define void @f4(ptr %a, i64 %n) nounwind "target-features"="+alu-lsl-fast" {
 ; CHECK-NEXT:  .LBB4_3: // %LI
 ; CHECK-NEXT:    // =>This Loop Header: Depth=1
 ; CHECK-NEXT:    // Child Loop BB4_6 Depth 2
-; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    mov x21, xzr
 ; CHECK-NEXT:    add x23, x22, #1
 ; CHECK-NEXT:    b .LBB4_6
 ; CHECK-NEXT:  .LBB4_4: // %if.else
@@ -162,13 +162,13 @@ define void @f4(ptr %a, i64 %n) nounwind "target-features"="+alu-lsl-fast" {
 ; CHECK-NEXT:    add x8, x21, #1
 ; CHECK-NEXT:    str w0, [x20, x21, lsl #2]
 ; CHECK-NEXT:    sub x9, x8, #1
+; CHECK-NEXT:    mov x21, x8
 ; CHECK-NEXT:    cmp x9, x19
 ; CHECK-NEXT:    b.ge .LBB4_2
 ; CHECK-NEXT:  .LBB4_6: // %LJ
 ; CHECK-NEXT:    // Parent Loop BB4_3 Depth=1
 ; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    mov x21, x8
-; CHECK-NEXT:    ldr w8, [x20, x8, lsl #2]
+; CHECK-NEXT:    ldr w8, [x20, x21, lsl #2]
 ; CHECK-NEXT:    tbz w8, #31, .LBB4_4
 ; CHECK-NEXT:  // %bb.7: // %if.then
 ; CHECK-NEXT:    // in Loop: Header=BB4_6 Depth=2

diff  --git a/llvm/test/CodeGen/AArch64/sve-lsrchain.ll b/llvm/test/CodeGen/AArch64/sve-lsrchain.ll
index 2fe3001ec0f44..d94fa6433bb7f 100644
--- a/llvm/test/CodeGen/AArch64/sve-lsrchain.ll
+++ b/llvm/test/CodeGen/AArch64/sve-lsrchain.ll
@@ -14,22 +14,22 @@ define void @test(ptr nocapture noundef readonly %kernel, i32 noundef %kw, float
 ; CHECK-NEXT:  // %bb.2: // %for.body.us.preheader
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    add x11, x2, x11, lsl #1
-; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    mov w8, wzr
 ; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    mov w9, wzr
+; CHECK-NEXT:    mov x9, xzr
 ; CHECK-NEXT:    mov w10, wzr
 ; CHECK-NEXT:    mov x12, #4 // =0x4
 ; CHECK-NEXT:    mov x13, #8 // =0x8
 ; CHECK-NEXT:  .LBB0_3: // %for.body.us
 ; CHECK-NEXT:    // =>This Loop Header: Depth=1
 ; CHECK-NEXT:    // Child Loop BB0_4 Depth 2
-; CHECK-NEXT:    add x14, x0, x8, lsl #2
-; CHECK-NEXT:    sbfiz x15, x9, #1, #32
+; CHECK-NEXT:    add x14, x0, x9, lsl #2
+; CHECK-NEXT:    sbfiz x15, x8, #1, #32
 ; CHECK-NEXT:    mov x16, x2
 ; CHECK-NEXT:    ldp s0, s1, [x14]
 ; CHECK-NEXT:    add x15, x15, #8
 ; CHECK-NEXT:    ldp s2, s3, [x14, #8]
-; CHECK-NEXT:    ubfiz x14, x9, #1, #32
+; CHECK-NEXT:    ubfiz x14, x8, #1, #32
 ; CHECK-NEXT:    fcvt h0, s0
 ; CHECK-NEXT:    fcvt h1, s1
 ; CHECK-NEXT:    fcvt h2, s2
@@ -91,8 +91,8 @@ define void @test(ptr nocapture noundef readonly %kernel, i32 noundef %kw, float
 ; CHECK-NEXT:  // %bb.5: // %while.cond.i..exit_crit_edge.us
 ; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    add w10, w10, #1
-; CHECK-NEXT:    add x8, x8, #4
-; CHECK-NEXT:    add w9, w9, #16
+; CHECK-NEXT:    add x9, x9, #4
+; CHECK-NEXT:    add w8, w8, #16
 ; CHECK-NEXT:    cmp w10, w1
 ; CHECK-NEXT:    b.ne .LBB0_3
 ; CHECK-NEXT:  .LBB0_6: // %exit78

diff  --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
index 39fe92aae0619..124f81e7864d1 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
@@ -11,12 +11,12 @@ define void @test_sink_ptrue_into_ptest(i32 %n) {
 ; CHECK-NEXT:    whilelt p0.s, wzr, w0
 ; CHECK-NEXT:    b.pl .LBB0_3
 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    mov w9, wzr
-; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    cntw x9
 ; CHECK-NEXT:  .LBB0_2: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    whilelt p0.s, w9, w0
-; CHECK-NEXT:    add w9, w9, w8
+; CHECK-NEXT:    whilelt p0.s, w8, w0
+; CHECK-NEXT:    add w8, w8, w9
 ; CHECK-NEXT:    b.mi .LBB0_2
 ; CHECK-NEXT:  .LBB0_3: // %exit
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/AArch64/swifterror.ll b/llvm/test/CodeGen/AArch64/swifterror.ll
index 9949a48034815..07ee87e880aff 100644
--- a/llvm/test/CodeGen/AArch64/swifterror.ll
+++ b/llvm/test/CodeGen/AArch64/swifterror.ll
@@ -564,11 +564,10 @@ define float @foo_loop(ptr swifterror %error_ptr_ref, i32 %cc, float %cc2) {
 ; CHECK-O0-AARCH64-NEXT:    mov w8, #16 ; =0x10
 ; CHECK-O0-AARCH64-NEXT:    mov w0, w8
 ; CHECK-O0-AARCH64-NEXT:    bl _malloc
-; CHECK-O0-AARCH64-NEXT:    mov x1, x0
-; CHECK-O0-AARCH64-NEXT:    mov x0, x1
-; CHECK-O0-AARCH64-NEXT:    str x1, [sp, #8] ; 8-byte Folded Spill
+; CHECK-O0-AARCH64-NEXT:    mov x9, x0
 ; CHECK-O0-AARCH64-NEXT:    mov w8, #1 ; =0x1
-; CHECK-O0-AARCH64-NEXT:    strb w8, [x0, #8]
+; CHECK-O0-AARCH64-NEXT:    strb w8, [x9, #8]
+; CHECK-O0-AARCH64-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
 ; CHECK-O0-AARCH64-NEXT:  LBB4_3: ; %bb_cont
 ; CHECK-O0-AARCH64-NEXT:    ; in Loop: Header=BB4_1 Depth=1
 ; CHECK-O0-AARCH64-NEXT:    ldr s0, [sp, #16] ; 4-byte Folded Reload
@@ -606,11 +605,12 @@ define float @foo_loop(ptr swifterror %error_ptr_ref, i32 %cc, float %cc2) {
 ; CHECK-O0-ARM64_32-NEXT:    mov w8, #16 ; =0x10
 ; CHECK-O0-ARM64_32-NEXT:    mov w0, w8
 ; CHECK-O0-ARM64_32-NEXT:    bl _malloc
-; CHECK-O0-ARM64_32-NEXT:    ; kill: def $x1 killed $x0
-; CHECK-O0-ARM64_32-NEXT:    mov x1, x0
-; CHECK-O0-ARM64_32-NEXT:    str x1, [sp, #8] ; 8-byte Folded Spill
+; CHECK-O0-ARM64_32-NEXT:    mov x9, x0
+; CHECK-O0-ARM64_32-NEXT:    ; kill: def $x0 killed $x9
+; CHECK-O0-ARM64_32-NEXT:    mov x0, x9
 ; CHECK-O0-ARM64_32-NEXT:    mov w8, #1 ; =0x1
-; CHECK-O0-ARM64_32-NEXT:    strb w8, [x0, #8]
+; CHECK-O0-ARM64_32-NEXT:    strb w8, [x9, #8]
+; CHECK-O0-ARM64_32-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
 ; CHECK-O0-ARM64_32-NEXT:  LBB4_3: ; %bb_cont
 ; CHECK-O0-ARM64_32-NEXT:    ; in Loop: Header=BB4_1 Depth=1
 ; CHECK-O0-ARM64_32-NEXT:    ldr s0, [sp, #16] ; 4-byte Folded Reload

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index 11afcebfada2e..666523c88860c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -1670,23 +1670,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
 ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v1, s16
-; GFX942-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; GFX942-NEXT:    v_mov_b32_e32 v2, s16
+; GFX942-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen
 ; GFX942-NEXT:    s_mov_b64 s[4:5], 0
-; GFX942-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX942-NEXT:    v_max_f32_e32 v3, v0, v0
 ; GFX942-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v5, v2
-; GFX942-NEXT:    v_max_f32_e32 v2, v5, v5
-; GFX942-NEXT:    v_max_f32_e32 v4, v2, v0
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    buffer_wbl2 sc1
-; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
+; GFX942-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_inv sc1
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX942-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1718,22 +1718,22 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s20
+; GFX90A-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX90A-NEXT:    v_max_f32_e32 v3, v0, v0
 ; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
-; GFX90A-NEXT:    v_max_f32_e32 v2, v5, v5
-; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v0
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX90A-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1743,23 +1743,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v3, s20
-; GFX908-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX908-NEXT:    v_mov_b32_e32 v2, s20
+; GFX908-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX908-NEXT:    v_max_f32_e32 v3, v0, v0
 ; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX908-NEXT:    v_max_f32_e32 v0, v0, v3
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v1
-; GFX908-NEXT:    v_max_f32_e32 v1, v5, v5
-; GFX908-NEXT:    v_max_f32_e32 v4, v1, v0
-; GFX908-NEXT:    v_mov_b32_e32 v1, v4
-; GFX908-NEXT:    v_mov_b32_e32 v2, v5
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v4, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1769,23 +1769,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v3, s20
-; GFX8-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v2, s20
+; GFX8-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v0
 ; GFX8-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v5
-; GFX8-NEXT:    v_max_f32_e32 v4, v1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v2, v5
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_mov_b32_e32 v4, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1986,24 +1986,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, s16
-; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
 ; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    buffer_load_b64 v[2:3], v6, s[0:3], null offen
 ; GFX12-NEXT:  .LBB15_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[9:10], v[9:10]
-; GFX12-NEXT:    v_max_num_f64_e32 v[7:8], v[2:3], v[0:1]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
-; GFX12-NEXT:    v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10
-; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX12-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10]
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
+; GFX12-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
 ; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2027,25 +2026,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, s16
-; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX11-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
 ; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:    buffer_load_b64 v[2:3], v6, s[0:3], 0 offen
 ; GFX11-NEXT:  .LBB15_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX11-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[2:3], v[9:10], v[9:10]
-; GFX11-NEXT:    v_max_f64 v[7:8], v[2:3], v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
-; GFX11-NEXT:    v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10
-; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX11-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10]
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
+; GFX11-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
 ; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2079,24 +2077,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, s20
 ; GFX908-NEXT:    buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen
-; GFX908-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX908-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:  .LBB15_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v10, v3
 ; GFX908-NEXT:    v_mov_b32_e32 v9, v2
-; GFX908-NEXT:    v_max_f64 v[2:3], v[9:10], v[9:10]
-; GFX908-NEXT:    v_max_f64 v[7:8], v[2:3], v[0:1]
-; GFX908-NEXT:    v_mov_b32_e32 v2, v7
-; GFX908-NEXT:    v_mov_b32_e32 v3, v8
-; GFX908-NEXT:    v_mov_b32_e32 v4, v9
-; GFX908-NEXT:    v_mov_b32_e32 v5, v10
-; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v8, v1
+; GFX908-NEXT:    v_mov_b32_e32 v7, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10]
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
+; GFX908-NEXT:    v_mov_b32_e32 v2, v7
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2108,24 +2106,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s20
 ; GFX8-NEXT:    buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen
-; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX8-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:  .LBB15_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v10, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v9, v2
-; GFX8-NEXT:    v_max_f64 v[2:3], v[9:10], v[9:10]
-; GFX8-NEXT:    v_max_f64 v[7:8], v[2:3], v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v2, v7
-; GFX8-NEXT:    v_mov_b32_e32 v3, v8
-; GFX8-NEXT:    v_mov_b32_e32 v4, v9
-; GFX8-NEXT:    v_mov_b32_e32 v5, v10
-; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_mov_b32_e32 v8, v1
+; GFX8-NEXT:    v_mov_b32_e32 v7, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10]
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v2, v7
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
index 404da6a8a1ef7..351502816ae6e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
@@ -1670,23 +1670,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
 ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v1, s16
-; GFX942-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; GFX942-NEXT:    v_mov_b32_e32 v2, s16
+; GFX942-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen
 ; GFX942-NEXT:    s_mov_b64 s[4:5], 0
-; GFX942-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX942-NEXT:    v_max_f32_e32 v3, v0, v0
 ; GFX942-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v5, v2
-; GFX942-NEXT:    v_max_f32_e32 v2, v5, v5
-; GFX942-NEXT:    v_min_f32_e32 v4, v2, v0
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    buffer_wbl2 sc1
-; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
+; GFX942-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_inv sc1
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX942-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1718,22 +1718,22 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s20
+; GFX90A-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX90A-NEXT:    v_max_f32_e32 v3, v0, v0
 ; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
-; GFX90A-NEXT:    v_max_f32_e32 v2, v5, v5
-; GFX90A-NEXT:    v_min_f32_e32 v4, v2, v0
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX90A-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1743,23 +1743,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v3, s20
-; GFX908-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX908-NEXT:    v_mov_b32_e32 v2, s20
+; GFX908-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX908-NEXT:    v_max_f32_e32 v3, v0, v0
 ; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX908-NEXT:    v_min_f32_e32 v0, v0, v3
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v1
-; GFX908-NEXT:    v_max_f32_e32 v1, v5, v5
-; GFX908-NEXT:    v_min_f32_e32 v4, v1, v0
-; GFX908-NEXT:    v_mov_b32_e32 v1, v4
-; GFX908-NEXT:    v_mov_b32_e32 v2, v5
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v4, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1769,23 +1769,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v3, s20
-; GFX8-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v2, s20
+; GFX8-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v0
 ; GFX8-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX8-NEXT:    v_min_f32_e32 v0, v0, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v5
-; GFX8-NEXT:    v_min_f32_e32 v4, v1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v2, v5
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_mov_b32_e32 v4, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1986,24 +1986,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, s16
-; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
 ; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    buffer_load_b64 v[2:3], v6, s[0:3], null offen
 ; GFX12-NEXT:  .LBB15_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[9:10], v[9:10]
-; GFX12-NEXT:    v_min_num_f64_e32 v[7:8], v[2:3], v[0:1]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
-; GFX12-NEXT:    v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10
-; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX12-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10]
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
+; GFX12-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
 ; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2027,25 +2026,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, s16
-; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX11-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
 ; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:    buffer_load_b64 v[2:3], v6, s[0:3], 0 offen
 ; GFX11-NEXT:  .LBB15_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX11-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[2:3], v[9:10], v[9:10]
-; GFX11-NEXT:    v_min_f64 v[7:8], v[2:3], v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
-; GFX11-NEXT:    v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10
-; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX11-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10]
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
+; GFX11-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
 ; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2079,24 +2077,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, s20
 ; GFX908-NEXT:    buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen
-; GFX908-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX908-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:  .LBB15_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v10, v3
 ; GFX908-NEXT:    v_mov_b32_e32 v9, v2
-; GFX908-NEXT:    v_max_f64 v[2:3], v[9:10], v[9:10]
-; GFX908-NEXT:    v_min_f64 v[7:8], v[2:3], v[0:1]
-; GFX908-NEXT:    v_mov_b32_e32 v2, v7
-; GFX908-NEXT:    v_mov_b32_e32 v3, v8
-; GFX908-NEXT:    v_mov_b32_e32 v4, v9
-; GFX908-NEXT:    v_mov_b32_e32 v5, v10
-; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v8, v1
+; GFX908-NEXT:    v_mov_b32_e32 v7, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10]
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
+; GFX908-NEXT:    v_mov_b32_e32 v2, v7
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2108,24 +2106,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s20
 ; GFX8-NEXT:    buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen
-; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX8-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:  .LBB15_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v10, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v9, v2
-; GFX8-NEXT:    v_max_f64 v[2:3], v[9:10], v[9:10]
-; GFX8-NEXT:    v_min_f64 v[7:8], v[2:3], v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v2, v7
-; GFX8-NEXT:    v_mov_b32_e32 v3, v8
-; GFX8-NEXT:    v_mov_b32_e32 v4, v9
-; GFX8-NEXT:    v_mov_b32_e32 v5, v10
-; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_mov_b32_e32 v8, v1
+; GFX8-NEXT:    v_mov_b32_e32 v7, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10]
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v2, v7
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
index 350f9ad882ed0..0da25260e2317 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
@@ -387,11 +387,11 @@ define void @nested_loops_temporal_divergence_both(float %pre.cond.val, i32 %n.i
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX10-NEXT:    ; Child Loop BB5_2 Depth 2
 ; GFX10-NEXT:    s_ashr_i32 s7, s6, 31
-; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    s_mov_b32 s4, s8
 ; GFX10-NEXT:    s_lshl_b64 s[10:11], s[6:7], 2
 ; GFX10-NEXT:    v_mov_b32_e32 v8, s10
 ; GFX10-NEXT:    v_mov_b32_e32 v9, s11
-; GFX10-NEXT:    s_mov_b32 s10, s8
+; GFX10-NEXT:    s_mov_b32 s10, 0
 ; GFX10-NEXT:    s_mov_b32 s11, 0
 ; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v2, v8
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v3, v9, vcc_lo
@@ -402,18 +402,18 @@ define void @nested_loops_temporal_divergence_both(float %pre.cond.val, i32 %n.i
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v8, s11
 ; GFX10-NEXT:    s_mov_b32 s12, exec_lo
 ; GFX10-NEXT:    s_add_i32 s11, s11, 1
-; GFX10-NEXT:    s_xor_b32 s10, s10, s12
+; GFX10-NEXT:    s_xor_b32 s4, s4, s12
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v8, v0
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_or_b32 s10, vcc_lo, s10
 ; GFX10-NEXT:    s_andn2_b32 s9, s9, exec_lo
-; GFX10-NEXT:    s_and_b32 s12, exec_lo, s10
+; GFX10-NEXT:    s_and_b32 s12, exec_lo, s4
 ; GFX10-NEXT:    s_or_b32 s9, s9, s12
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s10
 ; GFX10-NEXT:    s_cbranch_execnz .LBB5_2
 ; GFX10-NEXT:  ; %bb.3: ; %UseInst
 ; GFX10-NEXT:    ; in Loop: Header=BB5_1 Depth=1
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s10
 ; GFX10-NEXT:    v_mov_b32_e32 v9, s7
 ; GFX10-NEXT:    v_mov_b32_e32 v8, s6
 ; GFX10-NEXT:    v_cmp_lt_u32_e32 vcc_lo, s6, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 67c4b349f9502..f729de82cb042 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -10475,7 +10475,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v14
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v16, s37, 5
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v17, s101, 5
-; GFX11-TRUE16-NEXT:    s_mov_b32 vcc_hi, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s57, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s42, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18 : SGPR spill to VGPR lane
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19 : SGPR spill to VGPR lane
@@ -10653,7 +10653,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s42, 0
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s42, s18, 8
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, vcc_hi
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s57
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB13_3
 ; GFX11-TRUE16-NEXT:  .LBB13_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_i32 s5, s5, 3
@@ -11321,9 +11321,9 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB13_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 0
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 0
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr101_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr100_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr56_lo16
@@ -11381,72 +11381,72 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr72_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr62_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr60_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 1
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 2
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 3
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 4
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 5
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 6
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 7
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 8
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 9
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 10
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 11
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 12
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 13
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 14
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 15
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 17
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 18
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 19
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 20
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 21
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 22
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 23
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 24
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 25
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 26
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 27
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 28
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 29
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 30
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 31
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v19, s58, 0
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v19, s58, 1
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 1
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 2
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 3
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 4
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 5
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 6
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 7
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 8
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 9
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 10
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 11
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 12
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 13
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 14
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 15
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 17
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 18
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 19
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 20
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 21
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 22
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 23
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 24
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 25
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 26
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 27
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 28
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 29
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 30
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 31
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v19, s57, 0
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v19, s57, 1
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v19, s100, 2
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v19, s101, 3
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr100_lo16
@@ -11498,7 +11498,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s5, v14
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v16, s37, 5
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v17, s101, 5
-; GFX11-FAKE16-NEXT:    s_mov_b32 s44, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s101, 0
 ; GFX11-FAKE16-NEXT:    s_and_b32 s42, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19 : SGPR spill to VGPR lane
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18 : SGPR spill to VGPR lane
@@ -11531,497 +11531,676 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v16, s85, 29
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v16, s86, 30
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v16, s87, 31
-; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB13_4
+; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB13_2
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s5, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[46:47], s[4:5], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 4
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s5, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s102, s29, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s103, s28, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s104, s28, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 5
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s5, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 vcc_hi, s27, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s34, s27, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s35, s27, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 6
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s4, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s36, s26, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s37, s26, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s38, s25, 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 7
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s4, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s39, s25, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s48, s25, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s49, s24, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s7, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s50, s24, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s51, s23, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s52, s23, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 9
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s7, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s53, s23, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s54, s22, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s55, s22, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 10
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s7, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s64, s21, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s65, s21, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s66, s21, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 11
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s6, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s67, s20, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s68, s20, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s69, s19, 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 12
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s6, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s70, s19, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s71, s19, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s80, s18, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 13
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s9, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s81, s18, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s82, s17, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s83, s17, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 14
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s9, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s84, s17, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s85, s16, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s86, s16, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 15
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s9, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s87, s3, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s96, s3, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s97, s3, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s8, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s2, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s25, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[4:5], 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s24, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s104, s5, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s102, s5, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s103, s5, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 17
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s24, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s57, s4, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s47, s4, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s7, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 18
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s23, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 vcc_hi, s7, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s34, s7, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s69, s6, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 19
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s23, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s56, s6, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s35, s9, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s36, s9, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 20
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s23, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s37, s9, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s38, s8, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s39, s8, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 21
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s22, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s48, s11, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s49, s11, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s50, s11, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 22
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s22, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s51, s10, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s52, s10, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s53, s13, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 23
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s21, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s54, s13, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s55, s13, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s64, s12, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s21, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s65, s12, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s66, s15, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s67, s15, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 25
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s21, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s68, s15, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s59, s14, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s58, s14, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 26
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s20, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s70, s41, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s71, s41, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s60, s41, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 27
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s20, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s80, s40, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s61, s40, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s81, s29, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 28
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s19, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s82, s29, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s83, s29, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s84, s28, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 29
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s19, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s85, s28, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s86, s27, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s72, s27, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 30
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s19, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s87, s27, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s73, s26, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s96, s26, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 31
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s18, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s97, s25, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s43, 0
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s18, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 14
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s25, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s74, s2, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s43, 1
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s17, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 15
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[6:7], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s98, s1, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s43, 2
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s17, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 12
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s99, s1, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 17
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s8, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s100, s1, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s0, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s101, s0, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 18
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s11, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[56:57], s[10:11], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[58:59], s[12:13], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[60:61], s[14:15], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 19
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s11, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[40:41], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[72:73], s[28:29], 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s43, 3
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s17, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 13
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[8:9], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s44, s0, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s43, 4
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s16, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 10
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s0, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[76:77], s[26:27], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 20
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s11, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s43, 5
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s16, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 11
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[10:11], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[88:89], s[24:25], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[74:75], s[22:23], 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s43, 6
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s3, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[78:79], s[20:21], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 21
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s10, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[90:91], s[18:19], 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s43, 7
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s3, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 9
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[12:13], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[92:93], s[16:17], 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s43, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s3, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 6
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[94:95], s[2:3], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 22
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s10, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[30:31], s[0:1], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 23
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s13, 24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s13, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 25
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s13, 8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 26
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s12, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 27
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s12, 8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 28
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s15, 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 29
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s15, 16
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 30
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s15, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 31
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s14, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s43, 9
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s2, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 7
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[14:15], 24
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 4
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 5
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[40:41], 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 2
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 3
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[28:29], 24
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 0
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s14, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s46, 2
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 1
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s41, 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s47, 3
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[46:47], s[6:7], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 2
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s41, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s46, 0
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 3
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s41, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s47, 1
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[46:47], s[8:9], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 4
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s40, 16
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 5
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s40, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 6
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s29, 24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 7
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s29, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s2, 16
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s44
-; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB13_3
-; GFX11-FAKE16-NEXT:  .LBB13_2: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 1
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[22:23], 24
+; GFX11-FAKE16-NEXT:    s_branch .LBB13_3
+; GFX11-FAKE16-NEXT:  .LBB13_2:
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vcc_hi
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vcc_lo
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    s_mov_b32 s101, -1
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_lo, 0
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_hi, 1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vcc_lo
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_lo, 2
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_hi, 3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vcc_lo
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_lo, 4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_hi, 5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vcc_lo
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr45
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr44
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr99
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr43
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr74
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr94
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr92
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr90
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr78
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr62
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr97
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr73
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr72
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr61
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr60
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr58
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr59
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr56
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr47
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr57
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr103
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr102
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr104
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr88
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr76
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_lo, 6
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_hi, 7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vcc_lo
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_lo, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_hi, 9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vcc_lo
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_lo, 10
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_hi, 11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vcc_lo
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_lo, 12
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_hi, 13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vcc_lo
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_lo, 14
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_hi, 15
+; GFX11-FAKE16-NEXT:  .LBB13_3: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s101
+; GFX11-FAKE16-NEXT:    s_mov_b32 s101, s104
+; GFX11-FAKE16-NEXT:    s_mov_b32 s104, s57
+; GFX11-FAKE16-NEXT:    s_mov_b32 s57, s69
+; GFX11-FAKE16-NEXT:    s_mov_b32 s69, s42
+; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB13_5
+; GFX11-FAKE16-NEXT:  ; %bb.4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    s_add_i32 s25, s25, 3
+; GFX11-FAKE16-NEXT:    s_add_i32 s24, s24, 3
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s25, 8
+; GFX11-FAKE16-NEXT:    s_add_i32 s23, s23, 3
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s24, 16
+; GFX11-FAKE16-NEXT:    s_add_i32 s22, s22, 3
+; GFX11-FAKE16-NEXT:    s_add_i32 s21, s21, 3
+; GFX11-FAKE16-NEXT:    s_add_i32 s20, s20, 3
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 17
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s24, 8
+; GFX11-FAKE16-NEXT:    s_add_i32 s19, s19, 3
 ; GFX11-FAKE16-NEXT:    s_add_i32 s5, s5, 3
 ; GFX11-FAKE16-NEXT:    s_add_i32 s4, s4, 3
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s5, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 18
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s23, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[4:5], 24
 ; GFX11-FAKE16-NEXT:    s_add_i32 s7, s7, 3
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 4
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s5, 16
 ; GFX11-FAKE16-NEXT:    s_add_i32 s6, s6, 3
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 19
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s23, 16
 ; GFX11-FAKE16-NEXT:    s_add_i32 s9, s9, 3
 ; GFX11-FAKE16-NEXT:    s_add_i32 s8, s8, 3
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 5
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s5, 8
 ; GFX11-FAKE16-NEXT:    s_add_i32 s11, s11, 3
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 20
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s23, 8
 ; GFX11-FAKE16-NEXT:    s_add_i32 s10, s10, 3
+; GFX11-FAKE16-NEXT:    s_add_i32 s18, s18, 3
 ; GFX11-FAKE16-NEXT:    s_add_i32 s13, s13, 3
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 6
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s4, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 21
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s22, 16
 ; GFX11-FAKE16-NEXT:    s_add_i32 s12, s12, 3
+; GFX11-FAKE16-NEXT:    s_add_i32 s17, s17, 3
 ; GFX11-FAKE16-NEXT:    s_add_i32 s15, s15, 3
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 22
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s22, 8
 ; GFX11-FAKE16-NEXT:    s_add_i32 s14, s14, 3
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 7
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s4, 8
+; GFX11-FAKE16-NEXT:    s_add_i32 s16, s16, 3
 ; GFX11-FAKE16-NEXT:    s_add_i32 s41, s41, 3
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 23
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s21, 24
 ; GFX11-FAKE16-NEXT:    s_add_i32 s40, s40, 3
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[46:47], s[4:5], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s7, 24
+; GFX11-FAKE16-NEXT:    s_add_i32 s3, s3, 3
 ; GFX11-FAKE16-NEXT:    s_add_i32 s29, s29, 3
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s21, 16
+; GFX11-FAKE16-NEXT:    s_add_i32 s28, s28, 3
 ; GFX11-FAKE16-NEXT:    s_add_i32 s1, s1, 3
 ; GFX11-FAKE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 9
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s7, 16
-; GFX11-FAKE16-NEXT:    s_add_i32 s3, s3, 3
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 25
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s21, 8
 ; GFX11-FAKE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s17, s17, 3
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 10
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s7, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s19, s19, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 11
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s6, 16
-; GFX11-FAKE16-NEXT:    s_add_i32 s21, s21, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s23, s23, 3
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 12
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s6, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s25, s25, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 13
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s9, 24
 ; GFX11-FAKE16-NEXT:    s_add_i32 s27, s27, 3
 ; GFX11-FAKE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 14
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s9, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s102, s29, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s103, s28, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s104, s28, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 15
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s9, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 vcc_hi, s27, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s34, s27, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s35, s27, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s8, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s36, s26, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s37, s26, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s38, s25, 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 17
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s8, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s39, s25, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s48, s25, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s49, s24, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 18
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s11, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s50, s24, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s51, s23, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s52, s23, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 19
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s11, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s53, s23, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s54, s22, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s55, s22, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 20
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s11, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s64, s21, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s65, s21, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s66, s21, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 21
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s10, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s67, s20, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s68, s20, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s69, s19, 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 22
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s10, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s70, s19, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s71, s19, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s80, s18, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 23
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s13, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s81, s18, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s82, s17, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s83, s17, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s13, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s84, s17, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s85, s16, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s86, s16, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 25
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s13, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s87, s3, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s96, s3, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s97, s3, 8
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 26
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s12, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s2, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s98, s1, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s99, s1, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s20, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s101, s5, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s102, s5, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s103, s5, 8
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 27
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s12, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s100, s1, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s0, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s101, s0, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s20, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s104, s4, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s47, s4, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s7, 24
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 28
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s15, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[56:57], s[10:11], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[58:59], s[12:13], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[60:61], s[14:15], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s19, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 vcc_hi, s7, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s34, s7, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s57, s6, 16
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 29
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s15, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[40:41], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[72:73], s[28:29], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[76:77], s[26:27], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s19, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s56, s6, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s35, s9, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s36, s9, 16
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 30
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s15, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[88:89], s[24:25], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[74:75], s[22:23], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[78:79], s[20:21], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s19, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s37, s9, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s38, s8, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s39, s8, 8
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 31
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s14, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[90:91], s[18:19], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s18, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s48, s11, 24
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 0
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s14, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s46, 2
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[92:93], s[16:17], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[94:95], s[2:3], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s18, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 14
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s49, s11, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s50, s11, 8
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 1
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s41, 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s47, 3
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[46:47], s[6:7], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[30:31], s[0:1], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s17, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 15
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[6:7], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s51, s10, 16
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 2
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s41, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s46, 0
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s17, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 12
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s52, s10, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s53, s13, 24
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 3
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s41, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s47, 1
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[46:47], s[8:9], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s17, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 13
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[8:9], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s54, s13, 16
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 4
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s40, 16
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s16, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 10
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s55, s13, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s64, s12, 16
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 5
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s40, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s16, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 11
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[10:11], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s65, s12, 8
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 6
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s29, 24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s3, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s66, s15, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s67, s15, 16
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 7
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s29, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s3, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 9
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[12:13], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s68, s15, 8
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s2, 16
-; GFX11-FAKE16-NEXT:  .LBB13_3: ; %end
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s44, s101, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s45, s45, 0xff
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s44
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s44, s30, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s59, s14, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 6
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s58, s14, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s70, s41, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s71, s41, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s60, s41, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 7
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[14:15], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s80, s40, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s61, s40, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s81, s29, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 4
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s82, s29, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s83, s29, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s84, s28, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s85, s28, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 5
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[40:41], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s86, s27, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s72, s27, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s87, s27, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 2
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s73, s26, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s96, s26, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s97, s25, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s69, s25, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 3
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[28:29], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s3, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s74, s2, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s2, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 0
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s98, s1, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s99, s1, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s100, s1, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s44, s0, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s0, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 1
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[76:77], s[26:27], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[88:89], s[24:25], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[22:23], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[78:79], s[20:21], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[90:91], s[18:19], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[92:93], s[16:17], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[94:95], s[2:3], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[30:31], s[0:1], 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 9
+; GFX11-FAKE16-NEXT:  .LBB13_5: ; %end
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s43, s43, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s44, s45, s44
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s44, s44, 16
+; GFX11-FAKE16-NEXT:    s_and_b32 s42, s74, 0xff
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s43
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s43, s94, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s42, s42, 0xff
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT:    s_or_b32 s42, s42, s43
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s45, s45, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s42, s42, 16
+; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s42
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s42, v18, 9
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s45
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s45, s30, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s44, s44, 0xff
+; GFX11-FAKE16-NEXT:    s_and_b32 s3, s3, 0xff
+; GFX11-FAKE16-NEXT:    s_or_b32 s44, s44, s45
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s42, s42, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s44, s44, 16
+; GFX11-FAKE16-NEXT:    s_or_b32 s3, s3, s42
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s42, v18, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s43, v18, 7
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s44
 ; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s44, s100, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s42, s42, s43
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s45, s98, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s44
 ; GFX11-FAKE16-NEXT:    s_and_b32 s44, s99, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s45, s98, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s42, s42, 16
+; GFX11-FAKE16-NEXT:    s_and_b32 s42, s42, 0xff
 ; GFX11-FAKE16-NEXT:    s_or_b32 s44, s44, s45
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s42
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s3, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s42, s97, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s43, s43, 8
 ; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s44, s44, 16
-; GFX11-FAKE16-NEXT:    s_or_b32 s3, s3, s42
-; GFX11-FAKE16-NEXT:    s_and_b32 s42, s96, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s43, s87, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s44
 ; GFX11-FAKE16-NEXT:    s_or_b32 s42, s42, s43
+; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s44
 ; GFX11-FAKE16-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s42, s42, 16
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s0, s86, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s16, 0xff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v18, 6
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s3, s42
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s2, v18, 5
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s16, 0xff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s3, v18, 2
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1, s0
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s92, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s85, 0xff
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s84, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s2, v18, 4
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s82, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_and_b32 s1, s17, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s16, s80, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s16, v18, 0
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s83, 0xff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s2, v18, 3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s17, v19, 29
+; GFX11-FAKE16-NEXT:    s_and_b32 s16, s16, 0xff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s100, v17, 4
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s99, v17, 3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s3, s18, 0xff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s69, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s17, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s81, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s2, v18, 1
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v19, 28
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s20, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s18, v19, 19
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s3, s2
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s90, 8
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s16, s3
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s16, s71, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s16, v19, 31
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s0, s68, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s0, s0, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s3, s19, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s20, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s16, s16, 8
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1, s0
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s3, s16
-; GFX11-FAKE16-NEXT:    s_and_b32 s16, s70, 0xff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s16, v19, 30
 ; GFX11-FAKE16-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s78, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s18, s18, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s16, s16, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s19, s86, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s16, s16, s17
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s17, v19, 21
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s16, s16, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s78, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s98, v17, 2
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s3, s16
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s67, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s2, v19, 27
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s3, v19, 24
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s16, v19, 22
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s17, 8
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s66, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s2, v19, 26
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s64, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s16, s16, 0xff
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_and_b32 s1, s21, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s16, s54, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s86, v16, 30
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s65, 0xff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s2, v19, 25
 ; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v16, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s3, s22, 0xff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s53, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s55, 8
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s18, s51, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s2, v19, 23
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s1, v19, 18
+; GFX11-FAKE16-NEXT:    s_and_b32 s0, s24, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s74, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s62, 8
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s16, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s16, s23, 0xff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX11-FAKE16-NEXT:    s_or_b32 s16, s16, s17
-; GFX11-FAKE16-NEXT:    s_and_b32 s17, s52, 0xff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s17, v19, 20
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT:    s_or_b32 s17, s17, s18
 ; GFX11-FAKE16-NEXT:    s_and_b32 s3, s16, 0xffff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s17, s17, 0xff
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-FAKE16-NEXT:    s_or_b32 s17, s17, s18
+; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s16, s17, 16
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s97, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s3, s16
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s50, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s49, 0xff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s2, v19, 17
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s88, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-FAKE16-NEXT:    s_and_b32 s16, s69, 0xff
+; GFX11-FAKE16-NEXT:    s_and_b32 s18, s72, 0xff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s97, v17, 1
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s69, v16, 21
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s3
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s3, v19, 16
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s25, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s48, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s16, s39, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s38, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s16, s17
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-FAKE16-NEXT:    s_and_b32 s16, s73, 0xff
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s26, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s37, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s16, s36, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s96, 8
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s76, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s16, s17
 ; GFX11-FAKE16-NEXT:    s_and_b32 s16, s27, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s35, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s18, s34, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s19, vcc_hi, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s87, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX11-FAKE16-NEXT:    s_or_b32 s16, s16, s17
 ; GFX11-FAKE16-NEXT:    s_or_b32 s17, s18, s19
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX11-FAKE16-NEXT:    s_and_b32 s16, s16, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s17, 16
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s16, s17
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s16, v18, 8
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s17, v18, 7
-; GFX11-FAKE16-NEXT:    s_clause 0x1
-; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
-; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s16, v19, 0
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s104, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s103, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s72, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s85, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s84, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s16, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s17, v19, 1
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s29, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s102, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s16, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s17, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s83, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s16, s82, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s81, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s18, v19, 2
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s16, s17
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
@@ -12030,176 +12209,147 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s3
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s3, v18, 6
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s16, v18, 5
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s40, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s62, 8
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s18, v18, 3
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s16, s16, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s61, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s16, s80, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s18, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s19, v19, 3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s16, s17
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s17, v18, 4
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s19, v18, 2
 ; GFX11-FAKE16-NEXT:    s_and_b32 s16, s41, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s18, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s17, 8
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s19, s19, 8
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s60, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s18, s71, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s19, s70, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s16, s16, s17
 ; GFX11-FAKE16-NEXT:    s_or_b32 s17, s18, s19
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX11-FAKE16-NEXT:    s_and_b32 s16, s16, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s17, 16
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s16, s17
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s16, v19, 4
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s1, v18, 1
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s2, v18, 0
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, s14, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s60, 8
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s14, v19, 30
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s58, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s59, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s16, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s15, 0xff
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s3, v19, 31
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s15, v19, 29
-; GFX11-FAKE16-NEXT:    s_and_b32 s14, s14, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s15, s15, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s68, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s14, s67, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s15, s66, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s14, s15
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s14, v19, 6
+; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s12, 0xff
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s3, v19, 28
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s12, v19, 27
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s14, s58, 8
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s15, v19, 24
-; GFX11-FAKE16-NEXT:    s_clause 0x1
-; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off offset:48
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s12, s12, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s65, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s12, s64, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s14, s14, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s15, v19, 7
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s12, s14
 ; GFX11-FAKE16-NEXT:    s_and_b32 s12, s13, 0xff
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s13, v19, 26
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s14, v19, 25
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s15, s15, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s13, s13, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s14, s14, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s13, s55, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s14, s54, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s15, s53, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s12, s12, s13
 ; GFX11-FAKE16-NEXT:    s_or_b32 s13, s14, s15
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX11-FAKE16-NEXT:    s_and_b32 s12, s12, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s13, s13, 16
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s12, s13
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s12, v19, 8
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off offset:48
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s1, v19, 23
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s2, v19, 22
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, s10, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s56, 8
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s10, v19, 20
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s52, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s51, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s12, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s11, 0xff
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s3, v19, 21
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s11, v19, 19
-; GFX11-FAKE16-NEXT:    s_and_b32 s10, s10, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s11, s11, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s50, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s10, s49, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s11, s48, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s10, s11
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s10, v19, 10
+; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s8, 0xff
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s3, v19, 18
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s8, v19, 17
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s46, 8
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s11, v19, 14
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s8, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s39, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s8, s38, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s10, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s11, v19, 11
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s8, s10
 ; GFX11-FAKE16-NEXT:    s_and_b32 s8, s9, 0xff
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s9, v19, 16
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s10, v19, 15
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s11, s11, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s9, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s10, s10, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s37, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s10, s36, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s11, s35, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s8, s8, s9
 ; GFX11-FAKE16-NEXT:    s_or_b32 s9, s10, s11
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX11-FAKE16-NEXT:    s_and_b32 s8, s8, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s9, 16
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s8, s9
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s8, v19, 12
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s1, v19, 13
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s2, v19, 12
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s8, v19, 0
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, s6, 0xff
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s6, v19, 10
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s56, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s57, 0xff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s8, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s7, 0xff
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s3, v19, 11
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s7, v19, 9
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s6, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s34, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s6, vcc_hi, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s46, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s6, s7
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s6, v19, 14
+; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s4, 0xff
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s3, v19, 8
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s4, v19, 7
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s6, v19, 2
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s7, v19, 3
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s7, v19, 4
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s47, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s4, s104, 0xff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s7, v19, 15
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s4, s6
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s5, 0xff
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s5, v19, 6
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s6, v19, 5
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s103, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s6, s102, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s101, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s4, s4, s5
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, s6, s7
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
@@ -12207,7 +12357,9 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:64
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s9, v19, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s17, v19, 5
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s13, v19, 9
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s9, v19, 13
 ; GFX11-FAKE16-NEXT:    s_clause 0x2
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:80
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off offset:96
@@ -12216,13 +12368,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s103, v17, 7
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s102, v17, 6
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s101, v17, 5
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s100, v17, 4
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s99, v17, 3
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s98, v17, 2
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s97, v17, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s96, v17, 0
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s87, v16, 31
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s86, v16, 30
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s85, v16, 29
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s84, v16, 28
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s83, v16, 27
@@ -12231,7 +12378,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s80, v16, 24
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s71, v16, 23
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s70, v16, 22
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s69, v16, 21
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s68, v16, 20
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s67, v16, 19
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s66, v16, 18
@@ -12251,8 +12397,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s36, v16, 4
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s35, v16, 3
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s34, v16, 2
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v16, 1
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v16, 0
 ; GFX11-FAKE16-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-FAKE16-NEXT:    s_clause 0x3
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v16, off, s32
@@ -12262,145 +12406,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT:  .LBB13_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vcc_hi
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vcc_lo
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_lo, 0
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr101
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr45
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr30
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr100
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr99
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr98
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr43
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr94
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr97
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr96
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr87
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr86
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr85
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr92
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr84
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr83
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr82
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr81
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr80
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr90
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr71
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr70
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr69
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr68
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr67
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr78
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr66
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr65
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr64
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr55
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr54
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr74
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr53
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr52
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr51
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr50
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr49
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr48
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr39
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr38
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr37
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr36
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr35
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr34
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr104
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr103
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr102
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr88
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr76
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr72
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr62
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr60
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr58
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr56
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_hi, 1
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vcc_lo
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_lo, 2
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_hi, 3
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    s_branch .LBB13_2
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -19700,7 +19705,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:332
 ; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:8
@@ -19726,7 +19731,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:168
 ; VI-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:176
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v15
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v17
@@ -19735,11 +19740,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v21
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v56, 8, v3
-; VI-NEXT:    v_lshlrev_b32_e32 v8, 8, v5
-; VI-NEXT:    v_lshlrev_b32_e32 v10, 8, v7
-; VI-NEXT:    v_lshlrev_b32_e32 v14, 8, v9
-; VI-NEXT:    v_lshlrev_b32_e32 v12, 8, v11
+; VI-NEXT:    v_lshlrev_b32_e32 v8, 8, v3
+; VI-NEXT:    v_lshlrev_b32_e32 v59, 8, v5
+; VI-NEXT:    v_lshlrev_b32_e32 v14, 8, v7
+; VI-NEXT:    v_lshlrev_b32_e32 v10, 8, v9
+; VI-NEXT:    v_lshlrev_b32_e32 v16, 8, v11
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, 8, v13
 ; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -19760,7 +19765,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v41
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v40
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v55
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v54
@@ -19768,22 +19773,22 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v53
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v52
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v51
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v50
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v49
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v48
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v48
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v39
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
 ; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v22
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v31
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v32
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v33
@@ -19806,24 +19811,23 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 8, v38
 ; VI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; VI-NEXT:    s_waitcnt vmcnt(7)
-; VI-NEXT:    v_lshlrev_b32_e32 v22, 8, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v26, 8, v0
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_lshlrev_b32_e32 v32, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(5)
-; VI-NEXT:    v_lshlrev_b32_e32 v23, 8, v13
-; VI-NEXT:    s_waitcnt vmcnt(4)
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v54, 8, v13
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    v_lshlrev_b32_e32 v49, 8, v3
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v11
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v5
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v5
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v9
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v7
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:248
@@ -19836,37 +19840,37 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    buffer_load_ushort v7, off, s[0:3], s32 offset:304
 ; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
 ; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_lshlrev_b32_e32 v48, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_lshlrev_b32_e32 v27, 8, v13
 ; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v11
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
 ; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v5
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v28, 8, v3
+; VI-NEXT:    v_lshlrev_b32_e32 v29, 8, v3
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:312
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:320
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:328
 ; VI-NEXT:    buffer_load_ushort v11, off, s[0:3], s32 offset:4
-; VI-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:12
 ; VI-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:20
-; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_ushort v13, off, s[0:3], s32 offset:36
 ; VI-NEXT:    s_waitcnt vmcnt(11)
 ; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v7
-; VI-NEXT:    v_lshlrev_b32_e32 v29, 8, v9
+; VI-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
 ; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_lshlrev_b32_e32 v7, 8, v0
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v18, off, s[0:3], s32 offset:44
-; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:52
-; VI-NEXT:    buffer_load_ushort v15, off, s[0:3], s32 offset:60
+; VI-NEXT:    buffer_load_ushort v15, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:68
 ; VI-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:76
 ; VI-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:84
@@ -19874,23 +19878,23 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:100
 ; VI-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:108
 ; VI-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:116
-; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:124
+; VI-NEXT:    buffer_load_ushort v25, off, s[0:3], s32 offset:124
 ; VI-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:132
-; VI-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:140
+; VI-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:140
 ; VI-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:148
-; VI-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:156
+; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:156
 ; VI-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:164
 ; VI-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:172
-; VI-NEXT:    buffer_load_ushort v25, off, s[0:3], s32 offset:180
-; VI-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:188
+; VI-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:180
+; VI-NEXT:    buffer_load_ushort v23, off, s[0:3], s32 offset:188
 ; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:196
-; VI-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:204
-; VI-NEXT:    buffer_load_ushort v26, off, s[0:3], s32 offset:212
+; VI-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:204
+; VI-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:212
 ; VI-NEXT:    buffer_load_ushort v46, off, s[0:3], s32 offset:220
 ; VI-NEXT:    buffer_load_ushort v61, off, s[0:3], s32 offset:228
 ; VI-NEXT:    buffer_load_ushort v47, off, s[0:3], s32 offset:236
-; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:244
-; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:252
+; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:244
+; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:252
 ; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:260
 ; VI-NEXT:    buffer_load_ushort v63, off, s[0:3], s32 offset:268
 ; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:276
@@ -19903,57 +19907,57 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
 ; VI-NEXT:    s_waitcnt vmcnt(12)
-; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
 ; VI-NEXT:    s_cbranch_scc0 .LBB15_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
-; VI-NEXT:    v_or_b32_sdwa v0, v2, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
 ; VI-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -19968,12 +19972,12 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    s_lshl_b32 s7, s23, 8
 ; VI-NEXT:    s_lshl_b32 s8, s27, 8
 ; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_or_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_or_b32_sdwa v3, v3, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -20002,7 +20006,9 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v29, v9
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
@@ -20025,15 +20031,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v50, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -20043,18 +20049,18 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v59, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v56, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v39, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -20062,7 +20068,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v38, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v37, v0
@@ -20070,8 +20076,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v36, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -20083,41 +20089,39 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v33, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
 ; VI-NEXT:    v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v51, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v25, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v34, v22
+; VI-NEXT:    v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v0, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v43, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v32, v23
+; VI-NEXT:    v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; VI-NEXT:    v_mov_b32_e32 v51, v3
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_mov_b32_e32 v43, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v43, v49
+; VI-NEXT:    v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v32, v54
+; VI-NEXT:    v_mov_b32_e32 v34, v26
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v49, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v54, v0
@@ -20127,26 +20131,28 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v46, v61
 ; VI-NEXT:    v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v58, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v0, v45, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v47, v45
 ; VI-NEXT:    v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT:    v_or_b32_sdwa v0, v63, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v48, v28
-; VI-NEXT:    v_mov_b32_e32 v47, v58
-; VI-NEXT:    v_mov_b32_e32 v45, v44
-; VI-NEXT:    v_mov_b32_e32 v63, v42
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v58, v44
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_mov_b32_e32 v48, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
 ; VI-NEXT:    v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v63, v42
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -20162,8 +20168,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v57, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -20197,43 +20203,44 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
 ; VI-NEXT:    s_branch .LBB15_3
 ; VI-NEXT:  .LBB15_2:
-; VI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v32, v54
+; VI-NEXT:    v_mov_b32_e32 v43, v49
 ; VI-NEXT:    v_mov_b32_e32 v46, v61
-; VI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT:    v_mov_b32_e32 v34, v22
-; VI-NEXT:    v_mov_b32_e32 v32, v23
-; VI-NEXT:    v_mov_b32_e32 v47, v58
-; VI-NEXT:    v_mov_b32_e32 v45, v44
+; VI-NEXT:    v_mov_b32_e32 v47, v45
+; VI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v34, v26
+; VI-NEXT:    v_mov_b32_e32 v58, v44
 ; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_mov_b32_e32 v63, v42
 ; VI-NEXT:    v_mov_b32_e32 v51, v7
-; VI-NEXT:    v_mov_b32_e32 v48, v28
+; VI-NEXT:    v_mov_b32_e32 v48, v29
 ; VI-NEXT:    s_mov_b64 s[4:5], -1
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; VI-NEXT:  .LBB15_3: ; %Flow
 ; VI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; VI-NEXT:    v_mov_b32_e32 v42, v45
-; VI-NEXT:    v_mov_b32_e32 v45, v46
+; VI-NEXT:    v_mov_b32_e32 v44, v47
+; VI-NEXT:    v_mov_b32_e32 v47, v46
 ; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_mov_b32_e32 v46, v49
 ; VI-NEXT:    s_cbranch_vccnz .LBB15_5
 ; VI-NEXT:  ; %bb.4: ; %cmp.true
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
 ; VI-NEXT:    s_add_i32 s28, s28, 3
 ; VI-NEXT:    s_and_b32 s4, s28, 0xff
 ; VI-NEXT:    s_lshl_b32 s5, s29, 8
@@ -20287,7 +20294,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 0x300, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
@@ -20296,8 +20303,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 0x3000000, v0
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 0x3000000, v1
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -20310,8 +20317,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v5, vcc, 0x3000000, v0
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -20386,29 +20393,29 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v11, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v12, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
@@ -20420,8 +20427,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v13, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
@@ -20433,8 +20440,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v14, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
@@ -20446,8 +20453,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v15, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
@@ -20458,8 +20465,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v16, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -20469,8 +20476,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v17, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
@@ -20481,8 +20488,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v18, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -20492,63 +20499,63 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v19, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v20, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v21, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v22, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v23, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -20558,57 +20565,54 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v24, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
-; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v45
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v47
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v25, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v47
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v44
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v26, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v42
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v58
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v27, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v63
+; VI-NEXT:    v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v28, vcc, 0x3000000, v0
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v41
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v40
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
@@ -20893,7 +20897,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:164
 ; GFX9-NEXT:    buffer_load_ushort v23, off, s[0:3], s32 offset:172
 ; GFX9-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:180
-; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:188
+; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:188
 ; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:196
 ; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:204
 ; GFX9-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:212
@@ -20901,11 +20905,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:228
 ; GFX9-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:236
 ; GFX9-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:244
-; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:252
+; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:252
 ; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:260
 ; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:268
-; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:276
-; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:284
+; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:276
+; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:284
 ; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:292
 ; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:300
 ; GFX9-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:308
@@ -20931,7 +20935,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
@@ -21106,7 +21110,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v0, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -21118,7 +21122,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    v_mov_b32_e32 v37, v57
 ; GFX9-NEXT:    v_mov_b32_e32 v57, v60
 ; GFX9-NEXT:    v_mov_b32_e32 v52, v56
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_mov_b32_e32 v34, v0
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -21127,14 +21131,14 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v45, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -21144,12 +21148,12 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    v_mov_b32_e32 v51, v0
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -21203,7 +21207,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:  .LBB15_2:
 ; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
@@ -21565,12 +21569,12 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 3, v45
-; GFX9-NEXT:    v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v0, 0x300, v0
 ; GFX9-NEXT:    v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_add_u32_e32 v0, 3, v44
+; GFX9-NEXT:    v_add_u32_e32 v0, 3, v40
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 3, v55
@@ -21580,7 +21584,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_add_u32_e32 v0, 3, v43
-; GFX9-NEXT:    v_add_u32_e32 v1, 3, v42
+; GFX9-NEXT:    v_add_u32_e32 v1, 3, v36
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v0, 0x300, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -21589,7 +21593,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_add_u32_e32 v0, 3, v36
+; GFX9-NEXT:    v_add_u32_e32 v0, 3, v42
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 3, v41
@@ -24691,23 +24695,8 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a
 ; SI-NEXT:    s_mov_b32 s72, s74
 ; SI-NEXT:    s_mov_b32 s73, s75
 ; SI-NEXT:    s_mov_b32 s74, s76
-; SI-NEXT:    s_mov_b32 s75, s77
-; SI-NEXT:    s_mov_b32 s76, s78
-; SI-NEXT:    s_mov_b32 s77, s79
-; SI-NEXT:    s_mov_b32 s78, s88
-; SI-NEXT:    s_mov_b32 s79, s89
-; SI-NEXT:    s_mov_b32 s88, s90
-; SI-NEXT:    s_mov_b32 s89, s91
-; SI-NEXT:    s_mov_b32 s90, s92
-; SI-NEXT:    s_mov_b32 s91, s93
-; SI-NEXT:    s_mov_b32 s92, s94
-; SI-NEXT:    s_mov_b32 s93, s95
-; SI-NEXT:    s_mov_b32 s94, s30
-; SI-NEXT:    s_mov_b32 s95, s31
-; SI-NEXT:    s_mov_b32 s30, s34
-; SI-NEXT:    s_mov_b32 s31, s35
-; SI-NEXT:    v_readlane_b32 s34, v21, 0
-; SI-NEXT:    v_readlane_b32 s35, v21, 1
+; SI-NEXT:    v_readlane_b32 s75, v21, 0
+; SI-NEXT:    v_readlane_b32 s76, v21, 1
 ; SI-NEXT:    s_cbranch_vccnz .LBB17_5
 ; SI-NEXT:  ; %bb.4: ; %cmp.true
 ; SI-NEXT:    s_add_i32 s16, s16, 3
@@ -24769,22 +24758,22 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a
 ; SI-NEXT:    s_lshl_b32 s62, s84, 16
 ; SI-NEXT:    s_and_b32 s73, s83, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s72, s83, 16
-; SI-NEXT:    s_and_b32 s75, s82, 0xffff0000
+; SI-NEXT:    s_and_b32 s77, s82, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s74, s82, 16
-; SI-NEXT:    s_and_b32 s77, s81, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s76, s81, 16
-; SI-NEXT:    s_and_b32 s79, s80, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s78, s80, 16
-; SI-NEXT:    s_and_b32 s89, s71, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s88, s71, 16
-; SI-NEXT:    s_and_b32 s91, s70, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s90, s70, 16
-; SI-NEXT:    s_and_b32 s93, s29, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s92, s29, 16
-; SI-NEXT:    s_and_b32 s95, s28, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s94, s28, 16
-; SI-NEXT:    s_and_b32 s31, s27, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s30, s27, 16
+; SI-NEXT:    s_and_b32 s79, s81, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s78, s81, 16
+; SI-NEXT:    s_and_b32 s89, s80, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s88, s80, 16
+; SI-NEXT:    s_and_b32 s91, s71, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s90, s71, 16
+; SI-NEXT:    s_and_b32 s93, s70, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s92, s70, 16
+; SI-NEXT:    s_and_b32 s95, s29, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s94, s29, 16
+; SI-NEXT:    s_and_b32 s31, s28, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s30, s28, 16
+; SI-NEXT:    s_and_b32 s35, s27, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s34, s27, 16
 ; SI-NEXT:    s_and_b32 s37, s26, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s36, s26, 16
 ; SI-NEXT:    s_and_b32 s39, s25, 0xffff0000
@@ -24803,8 +24792,8 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a
 ; SI-NEXT:    s_lshl_b32 s66, s19, 16
 ; SI-NEXT:    s_and_b32 s69, s18, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s68, s18, 16
-; SI-NEXT:    s_and_b32 s35, s17, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s34, s17, 16
+; SI-NEXT:    s_and_b32 s76, s17, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s75, s17, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v21, s6, 2
 ; SI-NEXT:    s_lshl_b32 s6, s16, 16
@@ -24818,9 +24807,9 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s35
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s76
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s34
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s75
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -24888,56 +24877,56 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 40, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s35
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s34
+; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 44, v0
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s31
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s30
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 44, v0
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 48, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s95
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s94
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 48, v0
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s93
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s92
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 52, v0
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s91
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s90
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s89
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s88
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 64, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s79
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s78
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 64, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s77
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s76
-; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s75
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s77
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s74
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
@@ -28378,10 +28367,10 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_mul_f32_e32 v34, 1.0, v36
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s17
-; SI-NEXT:    v_mul_f32_e64 v33, 1.0, s18
+; SI-NEXT:    v_mul_f32_e64 v35, 1.0, s18
 ; SI-NEXT:    v_mul_f32_e64 v36, 1.0, s21
 ; SI-NEXT:    v_mul_f32_e64 v42, 1.0, s20
-; SI-NEXT:    v_mul_f32_e64 v35, 1.0, s22
+; SI-NEXT:    v_mul_f32_e64 v33, 1.0, s22
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
@@ -28410,9 +28399,9 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB19_4
@@ -28423,13 +28412,12 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; SI-NEXT:    v_alignbit_b32 v5, v5, v8, 16
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_alignbit_b32 v3, v3, v35, 16
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_alignbit_b32 v1, v1, v35, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_alignbit_b32 v4, v4, v9, 16
 ; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(5)
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_mov_b32_e32 v59, v2
 ; SI-NEXT:    v_alignbit_b32 v0, v0, v2, 16
@@ -28439,11 +28427,10 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_mov_b32_e32 v47, v10
 ; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
 ; SI-NEXT:    v_mov_b32_e32 v45, v12
-; SI-NEXT:    v_mov_b32_e32 v44, v14
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_alignbit_b32 v3, v3, v33, 16
+; SI-NEXT:    v_mov_b32_e32 v33, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_alignbit_b32 v1, v1, v33, 16
 ; SI-NEXT:    v_mov_b32_e32 v62, v38
 ; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
 ; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
@@ -28483,20 +28470,20 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_alignbit_b32 v28, v28, v37, 16
 ; SI-NEXT:    v_mov_b32_e32 v37, v34
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(3) expcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_mov_b32_e32 v35, v7
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_mov_b32_e32 v43, v8
 ; SI-NEXT:    v_alignbit_b32 v7, v7, v8, 16
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_mov_b32_e32 v60, v9
+; SI-NEXT:    s_waitcnt vmcnt(2) expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v42, v9
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v32
 ; SI-NEXT:    v_alignbit_b32 v31, v31, v34, 16
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v42, v8
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v60, v8
 ; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
 ; SI-NEXT:    v_alignbit_b32 v8, v8, v9, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v11
@@ -28520,7 +28507,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_alignbit_b32 v12, v12, v14, 16
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v33, v14
+; SI-NEXT:    v_mov_b32_e32 v44, v14
 ; SI-NEXT:    v_alignbit_b32 v13, v13, v14, 16
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -28543,7 +28530,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:  .LBB19_2: ; %cmp.true
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v59
 ; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
@@ -28559,7 +28546,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v35
 ; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
 ; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v42
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v60
 ; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
 ; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v57
@@ -28571,7 +28558,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v45
 ; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
 ; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v44
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v33
 ; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
 ; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v36
@@ -28683,7 +28670,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
@@ -28706,7 +28693,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v43
 ; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
 ; SI-NEXT:    v_alignbit_b32 v7, v8, v7, 16
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v60
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v42
 ; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
 ; SI-NEXT:    v_alignbit_b32 v8, v9, v8, 16
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v58
@@ -28721,7 +28708,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v63
 ; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
 ; SI-NEXT:    v_alignbit_b32 v12, v13, v12, 16
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v33
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v44
 ; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
 ; SI-NEXT:    v_alignbit_b32 v13, v14, v13, 16
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v62
@@ -28807,16 +28794,16 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB19_4:
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    s_waitcnt expcnt(3)
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; SI-NEXT:    v_mov_b32_e32 v61, v53
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
@@ -28825,7 +28812,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_mov_b32_e32 v57, v11
 ; SI-NEXT:    v_mov_b32_e32 v47, v10
 ; SI-NEXT:    v_mov_b32_e32 v45, v12
-; SI-NEXT:    v_mov_b32_e32 v44, v14
+; SI-NEXT:    v_mov_b32_e32 v33, v14
 ; SI-NEXT:    v_mov_b32_e32 v62, v38
 ; SI-NEXT:    v_mov_b32_e32 v38, v39
 ; SI-NEXT:    v_mov_b32_e32 v39, v41
@@ -31891,96 +31878,36 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $vgpr60
 ; SI-NEXT:    ; implicit-def: $vgpr61
 ; SI-NEXT:    ; implicit-def: $vgpr58
 ; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr56
+; SI-NEXT:    ; implicit-def: $vgpr43
 ; SI-NEXT:    ; implicit-def: $vgpr57
-; SI-NEXT:    ; implicit-def: $vgpr46
+; SI-NEXT:    ; implicit-def: $vgpr42
 ; SI-NEXT:    ; implicit-def: $vgpr47
-; SI-NEXT:    ; implicit-def: $vgpr40
+; SI-NEXT:    ; implicit-def: $vgpr44
 ; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr42
-; SI-NEXT:    ; implicit-def: $vgpr43
-; SI-NEXT:    ; implicit-def: $vgpr41
 ; SI-NEXT:    ; implicit-def: $vgpr38
+; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr55
+; SI-NEXT:    ; implicit-def: $vgpr53
 ; SI-NEXT:    ; implicit-def: $vgpr51
 ; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; implicit-def: $vgpr37
 ; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; implicit-def: $vgpr44
-; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr46
+; SI-NEXT:    ; implicit-def: $vgpr56
+; SI-NEXT:    ; implicit-def: $vgpr50
 ; SI-NEXT:    ; implicit-def: $vgpr52
-; SI-NEXT:    ; implicit-def: $vgpr55
 ; SI-NEXT:    ; implicit-def: $vgpr54
-; SI-NEXT:    ; implicit-def: $vgpr50
-; SI-NEXT:    ; implicit-def: $vgpr53
 ; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; implicit-def: $vgpr40
 ; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $vgpr34
+; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; SI-NEXT:    ; implicit-def: $vgpr31
@@ -31990,99 +31917,173 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; kill: killed $vgpr31
 ; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB20_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v62
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v32, v31
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v63
-; SI-NEXT:    v_cvt_f32_f16_e32 v36, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v34, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v53, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v40, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v29
 ; SI-NEXT:    v_cvt_f32_f16_e32 v54, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v52, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v50, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v27
-; SI-NEXT:    v_cvt_f32_f16_e32 v44, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v46, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v26
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v38, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v7
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v36, v63
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v25
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, v32
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
+; SI-NEXT:    v_cvt_f32_f16_e32 v48, v30
+; SI-NEXT:    v_cvt_f32_f16_e32 v52, v29
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v24
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v4
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
+; SI-NEXT:    v_cvt_f32_f16_e32 v56, v28
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v23
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v22
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v21
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
 ; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v44, v5
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v20
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v47, v32
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v42, v4
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v57, v32
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v43, v3
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v18
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v59, v32
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v58, v2
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v17
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v61, v32
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v60, v1
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v62
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v63
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr17
+; SI-NEXT:    ; implicit-def: $vgpr18
+; SI-NEXT:    ; implicit-def: $vgpr19
+; SI-NEXT:    ; implicit-def: $vgpr20
+; SI-NEXT:    ; implicit-def: $vgpr21
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr23
+; SI-NEXT:    ; implicit-def: $vgpr24
+; SI-NEXT:    ; implicit-def: $vgpr25
+; SI-NEXT:    ; implicit-def: $vgpr26
+; SI-NEXT:    ; implicit-def: $vgpr27
+; SI-NEXT:    ; implicit-def: $vgpr28
+; SI-NEXT:    ; implicit-def: $vgpr29
+; SI-NEXT:    ; implicit-def: $vgpr30
+; SI-NEXT:    ; implicit-def: $vgpr63
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v15
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, v31
@@ -32097,6 +32098,16 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v51, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v53, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v55, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v41, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v38, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v45, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
 ; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
@@ -32104,37 +32115,28 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v50, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v55, v29
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v46, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v56, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v58, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v60, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v47, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v57, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v59, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v61, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, v62
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr3
@@ -32151,22 +32153,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr15
 ; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr30
 ; SI-NEXT:    ; implicit-def: $vgpr62
-; SI-NEXT:    ; implicit-def: $vgpr63
 ; SI-NEXT:  .LBB20_2: ; %Flow
 ; SI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB20_4
@@ -32174,78 +32161,87 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v5
 ; SI-NEXT:    v_add_i32_e32 v33, vcc, 3, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, 3, v6
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 3, v7
+; SI-NEXT:    v_add_i32_e32 v28, vcc, 3, v28
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 3, v9
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v28
+; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_add_i32_e32 v25, vcc, 3, v25
+; SI-NEXT:    v_add_i32_e32 v26, vcc, 3, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v25
+; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v26
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v50
+; SI-NEXT:    v_mov_b32_e32 v50, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v48
+; SI-NEXT:    v_add_i32_e32 v24, vcc, 3, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v24
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v38
+; SI-NEXT:    v_add_i32_e32 v23, vcc, 3, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v23
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v56
+; SI-NEXT:    v_add_i32_e32 v22, vcc, 3, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v22
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v34
 ; SI-NEXT:    v_add_i32_e32 v21, vcc, 3, v21
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v21
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v32
 ; SI-NEXT:    v_add_i32_e32 v20, vcc, 3, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v20
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v20
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v46
 ; SI-NEXT:    v_add_i32_e32 v19, vcc, 3, v19
 ; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v19
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v61
 ; SI-NEXT:    v_add_i32_e32 v18, vcc, 3, v18
 ; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v18
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v59
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, 3, v17
 ; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v17
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v57
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 3, v16
 ; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v47
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 3, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v9
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v41
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_add_i32_e32 v26, vcc, 3, v26
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v26
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v55
+; SI-NEXT:    v_add_i32_e32 v7, vcc, 3, v7
 ; SI-NEXT:    v_add_i32_e32 v8, vcc, 3, v8
-; SI-NEXT:    v_add_i32_e32 v25, vcc, 3, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v25
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v38
-; SI-NEXT:    v_cvt_f32_f16_e32 v38, v43
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_add_i32_e32 v40, vcc, 3, v63
-; SI-NEXT:    v_add_i32_e32 v42, vcc, 3, v62
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v40
-; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v42
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v42
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v62
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v63
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
 ; SI-NEXT:    v_add_i32_e32 v27, vcc, 3, v27
+; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    v_add_i32_e32 v42, vcc, 3, v63
+; SI-NEXT:    v_add_i32_e32 v44, vcc, 3, v62
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 3, v3
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v4
@@ -32255,43 +32251,39 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_add_i32_e32 v13, vcc, 3, v13
 ; SI-NEXT:    v_add_i32_e32 v14, vcc, 3, v14
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 3, v15
-; SI-NEXT:    v_add_i32_e32 v22, vcc, 3, v22
-; SI-NEXT:    v_add_i32_e32 v23, vcc, 3, v23
-; SI-NEXT:    v_add_i32_e32 v24, vcc, 3, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v27
-; SI-NEXT:    v_add_i32_e32 v28, vcc, 3, v28
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v27
 ; SI-NEXT:    v_add_i32_e32 v29, vcc, 3, v29
 ; SI-NEXT:    v_add_i32_e32 v30, vcc, 3, v30
+; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v42
+; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v44
 ; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v10
 ; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v11
 ; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v12
 ; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v13
 ; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v23
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v28
 ; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v29
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v40
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v30
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, v44
+; SI-NEXT:    v_cvt_f32_f16_e32 v36, v42
 ; SI-NEXT:    v_cvt_f32_f16_e32 v30, v30
 ; SI-NEXT:    v_cvt_f32_f16_e32 v29, v29
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v5
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v44, v5
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v46, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v42, v4
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v56, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v43, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v58, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v33
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v50
-; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v62
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v63
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v52
 ; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
 ; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
 ; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
@@ -32308,58 +32300,53 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
 ; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
 ; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_cvt_f32_f16_e32 v38, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v60, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v53, v53
+; SI-NEXT:    v_cvt_f32_f16_e32 v40, v40
 ; SI-NEXT:    v_cvt_f32_f16_e32 v54, v54
-; SI-NEXT:    v_cvt_f32_f16_e32 v52, v52
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v36
-; SI-NEXT:    v_mov_b32_e32 v36, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v34
-; SI-NEXT:    v_mov_b32_e32 v34, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v44
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, v45
 ; SI-NEXT:    v_cvt_f32_f16_e32 v35, v35
 ; SI-NEXT:    v_cvt_f32_f16_e32 v37, v37
 ; SI-NEXT:    v_cvt_f32_f16_e32 v39, v39
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, v49
 ; SI-NEXT:    v_cvt_f32_f16_e32 v51, v51
+; SI-NEXT:    v_cvt_f32_f16_e32 v53, v53
+; SI-NEXT:    v_cvt_f32_f16_e32 v55, v55
+; SI-NEXT:    v_cvt_f32_f16_e32 v41, v41
 ; SI-NEXT:    v_cvt_f32_f16_e32 v47, v5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v57, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v59, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v61, v2
-; SI-NEXT:    v_mov_b32_e32 v50, v30
-; SI-NEXT:    v_mov_b32_e32 v55, v29
-; SI-NEXT:    v_mov_b32_e32 v44, v8
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v52, v29
+; SI-NEXT:    v_mov_b32_e32 v48, v30
+; SI-NEXT:    v_mov_b32_e32 v56, v28
+; SI-NEXT:    v_mov_b32_e32 v34, v7
+; SI-NEXT:    v_mov_b32_e32 v32, v6
+; SI-NEXT:    v_mov_b32_e32 v46, v8
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v45, v1
 ; SI-NEXT:  .LBB20_4: ; %end
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
@@ -32378,32 +32365,34 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v57
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v56
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v43
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v47
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v46
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v42
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v45
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v40
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v44
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v43
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v42
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v38
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v41
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 24, v0
@@ -32412,25 +32401,25 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v38
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v55
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v31
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v53
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v51
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 36, v0
@@ -32439,7 +32428,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v49
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 40, v0
@@ -32448,7 +32437,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v39
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 44, v0
@@ -32457,7 +32446,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v37
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
@@ -32466,7 +32455,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v35
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
@@ -32475,7 +32464,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v33
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 56, v0
@@ -32485,8 +32474,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -32496,8 +32485,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 64, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -32507,8 +32496,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x44, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -32518,8 +32507,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x48, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -32529,8 +32518,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x4c, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -32540,8 +32529,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x50, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -32551,8 +32540,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x54, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -32562,8 +32551,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x58, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -32573,8 +32562,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x5c, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -32584,8 +32573,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x60, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -32595,8 +32584,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x64, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -32605,48 +32594,46 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v44
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v46
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x68, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v52
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v50
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v56
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x6c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v54
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v55
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v52
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x70, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v53
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v50
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v40
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v48
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x74, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v36
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v48
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v34
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v36
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x78, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v34
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v32
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v32
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v31
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -37868,24 +37855,24 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:36
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:48
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:48
 ; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:52
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:64
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:68
-; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v55, 16, v3
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:68
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v55, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v43, 16, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v15
 ; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v17
 ; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v19
 ; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v21
@@ -37906,23 +37893,23 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_lshlrev_b32_e32 v54, 16, v12
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v38
 ; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v36
 ; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v34
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v32
-; SI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v32
+; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
@@ -37935,46 +37922,45 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB27_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    s_waitcnt expcnt(3)
+; SI-NEXT:    s_waitcnt expcnt(2)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v7, v0, v58
+; SI-NEXT:    v_or_b32_e32 v7, v0, v61
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v4
-; SI-NEXT:    v_or_b32_e32 v9, v0, v61
+; SI-NEXT:    v_or_b32_e32 v9, v0, v50
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v6
-; SI-NEXT:    v_or_b32_e32 v10, v0, v50
+; SI-NEXT:    v_or_b32_e32 v10, v0, v43
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v57
-; SI-NEXT:    v_or_b32_e32 v11, v0, v45
+; SI-NEXT:    v_or_b32_e32 v11, v0, v41
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v56
-; SI-NEXT:    v_or_b32_e32 v12, v0, v41
+; SI-NEXT:    v_or_b32_e32 v12, v0, v40
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v49
-; SI-NEXT:    v_mov_b32_e32 v36, v41
-; SI-NEXT:    v_mov_b32_e32 v41, v13
+; SI-NEXT:    v_mov_b32_e32 v52, v57
+; SI-NEXT:    v_mov_b32_e32 v57, v40
+; SI-NEXT:    v_mov_b32_e32 v40, v49
+; SI-NEXT:    v_mov_b32_e32 v49, v13
 ; SI-NEXT:    v_or_b32_e32 v13, v0, v13
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v14
-; SI-NEXT:    v_mov_b32_e32 v51, v50
-; SI-NEXT:    v_mov_b32_e32 v50, v45
-; SI-NEXT:    v_mov_b32_e32 v45, v14
-; SI-NEXT:    v_or_b32_e32 v14, v0, v40
+; SI-NEXT:    v_mov_b32_e32 v36, v41
+; SI-NEXT:    v_mov_b32_e32 v41, v14
+; SI-NEXT:    v_or_b32_e32 v14, v0, v48
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v16
-; SI-NEXT:    v_mov_b32_e32 v52, v57
-; SI-NEXT:    v_mov_b32_e32 v57, v49
-; SI-NEXT:    v_mov_b32_e32 v49, v40
-; SI-NEXT:    v_mov_b32_e32 v40, v15
+; SI-NEXT:    v_mov_b32_e32 v51, v50
+; SI-NEXT:    v_mov_b32_e32 v50, v43
+; SI-NEXT:    v_mov_b32_e32 v43, v48
+; SI-NEXT:    v_mov_b32_e32 v48, v15
 ; SI-NEXT:    v_or_b32_e32 v15, v0, v15
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v18
-; SI-NEXT:    v_mov_b32_e32 v34, v58
-; SI-NEXT:    v_mov_b32_e32 v58, v61
+; SI-NEXT:    v_mov_b32_e32 v38, v61
 ; SI-NEXT:    v_mov_b32_e32 v61, v56
 ; SI-NEXT:    v_mov_b32_e32 v56, v16
 ; SI-NEXT:    v_or_b32_e32 v16, v0, v37
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v20
-; SI-NEXT:    s_waitcnt expcnt(2)
 ; SI-NEXT:    v_or_b32_e32 v17, v0, v17
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v22
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -38008,7 +37994,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    s_and_b32 s6, s20, 0xffff
 ; SI-NEXT:    s_lshl_b32 s7, s21, 16
 ; SI-NEXT:    v_or_b32_e32 v26, v0, v26
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v59
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v45
 ; SI-NEXT:    s_or_b32 s6, s6, s7
 ; SI-NEXT:    s_and_b32 s7, s22, 0xffff
 ; SI-NEXT:    s_lshl_b32 s8, s23, 16
@@ -38019,7 +38005,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s9, s25, 16
 ; SI-NEXT:    v_mov_b32_e32 v33, v28
 ; SI-NEXT:    v_or_b32_e32 v28, v0, v5
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v43
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v59
 ; SI-NEXT:    s_or_b32 s8, s8, s9
 ; SI-NEXT:    s_and_b32 s9, s26, 0xffff
 ; SI-NEXT:    s_lshl_b32 s10, s27, 16
@@ -38031,7 +38017,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s11, s29, 16
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; SI-NEXT:    v_or_b32_e32 v30, v0, v3
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v38
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v58
 ; SI-NEXT:    s_or_b32 s10, s10, s11
 ; SI-NEXT:    v_mov_b32_e32 v63, v2
 ; SI-NEXT:    v_mov_b32_e32 v32, v55
@@ -38039,9 +38025,9 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v55, v4
 ; SI-NEXT:    v_mov_b32_e32 v53, v6
 ; SI-NEXT:    v_mov_b32_e32 v47, v46
-; SI-NEXT:    v_mov_b32_e32 v59, v44
-; SI-NEXT:    v_mov_b32_e32 v43, v42
-; SI-NEXT:    v_or_b32_e32 v31, v0, v48
+; SI-NEXT:    v_mov_b32_e32 v45, v44
+; SI-NEXT:    v_mov_b32_e32 v59, v42
+; SI-NEXT:    v_or_b32_e32 v31, v0, v34
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    v_mov_b32_e32 v2, s6
@@ -38051,13 +38037,12 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v6, s10
 ; SI-NEXT:    s_cbranch_execnz .LBB27_3
 ; SI-NEXT:  .LBB27_2: ; %cmp.true
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v63
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v32, v1
 ; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x30000, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v38, v43
 ; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s17, 16
@@ -38101,42 +38086,42 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v34, v0
+; SI-NEXT:    v_or_b32_e32 v0, v38, v0
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v55
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v58, v0
+; SI-NEXT:    v_or_b32_e32 v0, v51, v0
 ; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v53
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v51, v0
+; SI-NEXT:    v_or_b32_e32 v0, v50, v0
 ; SI-NEXT:    v_add_i32_e32 v10, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v52
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v50, v0
+; SI-NEXT:    v_or_b32_e32 v0, v36, v0
 ; SI-NEXT:    v_add_i32_e32 v11, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v61
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v36, v0
+; SI-NEXT:    v_or_b32_e32 v0, v57, v0
 ; SI-NEXT:    v_add_i32_e32 v12, vcc, 0x30000, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v57
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v40
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v41, v0
+; SI-NEXT:    v_or_b32_e32 v0, v49, v0
 ; SI-NEXT:    v_add_i32_e32 v13, vcc, 0x30000, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v45
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v41
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v49, v0
+; SI-NEXT:    v_or_b32_e32 v0, v43, v0
 ; SI-NEXT:    v_add_i32_e32 v14, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v56
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v40, v0
+; SI-NEXT:    v_or_b32_e32 v0, v48, v0
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 0x30000, v0
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v37
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -38198,7 +38183,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v25, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v47
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -38214,12 +38199,12 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v27, vcc, 0x30000, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v59
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v45
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    v_add_i32_e32 v28, vcc, 0x30000, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
@@ -38228,7 +38213,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v29, vcc, 0x30000, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v38
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v59
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -38263,26 +38248,26 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB27_4:
-; SI-NEXT:    v_mov_b32_e32 v34, v58
+; SI-NEXT:    v_mov_b32_e32 v38, v61
 ; SI-NEXT:    v_mov_b32_e32 v32, v55
-; SI-NEXT:    v_mov_b32_e32 v58, v61
 ; SI-NEXT:    v_mov_b32_e32 v63, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v4
 ; SI-NEXT:    v_mov_b32_e32 v53, v6
 ; SI-NEXT:    v_mov_b32_e32 v52, v57
 ; SI-NEXT:    v_mov_b32_e32 v51, v50
 ; SI-NEXT:    v_mov_b32_e32 v61, v56
-; SI-NEXT:    v_mov_b32_e32 v50, v45
+; SI-NEXT:    v_mov_b32_e32 v50, v43
 ; SI-NEXT:    v_mov_b32_e32 v36, v41
-; SI-NEXT:    v_mov_b32_e32 v41, v13
-; SI-NEXT:    v_mov_b32_e32 v57, v49
-; SI-NEXT:    v_mov_b32_e32 v49, v40
-; SI-NEXT:    v_mov_b32_e32 v40, v15
-; SI-NEXT:    v_mov_b32_e32 v45, v14
+; SI-NEXT:    v_mov_b32_e32 v57, v40
+; SI-NEXT:    v_mov_b32_e32 v40, v49
+; SI-NEXT:    v_mov_b32_e32 v49, v13
+; SI-NEXT:    v_mov_b32_e32 v43, v48
+; SI-NEXT:    v_mov_b32_e32 v48, v15
+; SI-NEXT:    v_mov_b32_e32 v41, v14
 ; SI-NEXT:    v_mov_b32_e32 v56, v16
 ; SI-NEXT:    v_mov_b32_e32 v47, v46
-; SI-NEXT:    v_mov_b32_e32 v59, v44
-; SI-NEXT:    v_mov_b32_e32 v43, v42
+; SI-NEXT:    v_mov_b32_e32 v45, v44
+; SI-NEXT:    v_mov_b32_e32 v59, v42
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    v_mov_b32_e32 v37, v20
 ; SI-NEXT:    v_mov_b32_e32 v39, v23
@@ -58437,7 +58422,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:332
 ; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:8
@@ -58463,7 +58448,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:168
 ; VI-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:176
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v15
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v17
@@ -58472,11 +58457,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v21
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v56, 8, v3
-; VI-NEXT:    v_lshlrev_b32_e32 v8, 8, v5
-; VI-NEXT:    v_lshlrev_b32_e32 v10, 8, v7
-; VI-NEXT:    v_lshlrev_b32_e32 v14, 8, v9
-; VI-NEXT:    v_lshlrev_b32_e32 v12, 8, v11
+; VI-NEXT:    v_lshlrev_b32_e32 v8, 8, v3
+; VI-NEXT:    v_lshlrev_b32_e32 v59, 8, v5
+; VI-NEXT:    v_lshlrev_b32_e32 v14, 8, v7
+; VI-NEXT:    v_lshlrev_b32_e32 v10, 8, v9
+; VI-NEXT:    v_lshlrev_b32_e32 v16, 8, v11
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, 8, v13
 ; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -58497,7 +58482,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v41
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v40
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v55
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v54
@@ -58505,22 +58490,22 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v53
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v52
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v51
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v50
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v49
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v48
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v48
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v39
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
 ; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v22
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v31
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v32
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v33
@@ -58543,24 +58528,23 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 8, v38
 ; VI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; VI-NEXT:    s_waitcnt vmcnt(7)
-; VI-NEXT:    v_lshlrev_b32_e32 v22, 8, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v26, 8, v0
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_lshlrev_b32_e32 v32, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(5)
-; VI-NEXT:    v_lshlrev_b32_e32 v23, 8, v13
-; VI-NEXT:    s_waitcnt vmcnt(4)
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v54, 8, v13
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    v_lshlrev_b32_e32 v49, 8, v3
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v11
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v5
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v5
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v9
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v7
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:248
@@ -58573,37 +58557,37 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    buffer_load_ushort v7, off, s[0:3], s32 offset:304
 ; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
 ; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_lshlrev_b32_e32 v48, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_lshlrev_b32_e32 v27, 8, v13
 ; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v11
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
 ; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v5
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v28, 8, v3
+; VI-NEXT:    v_lshlrev_b32_e32 v29, 8, v3
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:312
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:320
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:328
 ; VI-NEXT:    buffer_load_ushort v11, off, s[0:3], s32 offset:4
-; VI-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:12
 ; VI-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:20
-; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_ushort v13, off, s[0:3], s32 offset:36
 ; VI-NEXT:    s_waitcnt vmcnt(11)
 ; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v7
-; VI-NEXT:    v_lshlrev_b32_e32 v29, 8, v9
+; VI-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
 ; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_lshlrev_b32_e32 v7, 8, v0
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v18, off, s[0:3], s32 offset:44
-; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:52
-; VI-NEXT:    buffer_load_ushort v15, off, s[0:3], s32 offset:60
+; VI-NEXT:    buffer_load_ushort v15, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:68
 ; VI-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:76
 ; VI-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:84
@@ -58611,23 +58595,23 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:100
 ; VI-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:108
 ; VI-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:116
-; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:124
+; VI-NEXT:    buffer_load_ushort v25, off, s[0:3], s32 offset:124
 ; VI-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:132
-; VI-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:140
+; VI-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:140
 ; VI-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:148
-; VI-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:156
+; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:156
 ; VI-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:164
 ; VI-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:172
-; VI-NEXT:    buffer_load_ushort v25, off, s[0:3], s32 offset:180
-; VI-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:188
+; VI-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:180
+; VI-NEXT:    buffer_load_ushort v23, off, s[0:3], s32 offset:188
 ; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:196
-; VI-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:204
-; VI-NEXT:    buffer_load_ushort v26, off, s[0:3], s32 offset:212
+; VI-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:204
+; VI-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:212
 ; VI-NEXT:    buffer_load_ushort v46, off, s[0:3], s32 offset:220
 ; VI-NEXT:    buffer_load_ushort v61, off, s[0:3], s32 offset:228
 ; VI-NEXT:    buffer_load_ushort v47, off, s[0:3], s32 offset:236
-; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:244
-; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:252
+; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:244
+; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:252
 ; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:260
 ; VI-NEXT:    buffer_load_ushort v63, off, s[0:3], s32 offset:268
 ; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:276
@@ -58640,57 +58624,57 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
 ; VI-NEXT:    s_waitcnt vmcnt(12)
-; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
 ; VI-NEXT:    s_cbranch_scc0 .LBB39_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
-; VI-NEXT:    v_or_b32_sdwa v0, v2, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
 ; VI-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -58705,12 +58689,12 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    s_lshl_b32 s7, s23, 8
 ; VI-NEXT:    s_lshl_b32 s8, s27, 8
 ; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_or_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_or_b32_sdwa v3, v3, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -58739,7 +58723,9 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v29, v9
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
@@ -58762,15 +58748,15 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v50, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -58780,18 +58766,18 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v59, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v56, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v39, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -58799,7 +58785,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v38, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v37, v0
@@ -58807,8 +58793,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v36, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -58820,41 +58806,39 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v33, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
 ; VI-NEXT:    v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v51, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v25, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v34, v22
+; VI-NEXT:    v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v0, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v43, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v32, v23
+; VI-NEXT:    v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; VI-NEXT:    v_mov_b32_e32 v51, v3
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_mov_b32_e32 v43, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v43, v49
+; VI-NEXT:    v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v32, v54
+; VI-NEXT:    v_mov_b32_e32 v34, v26
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v49, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v54, v0
@@ -58864,26 +58848,28 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v46, v61
 ; VI-NEXT:    v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v58, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v0, v45, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v47, v45
 ; VI-NEXT:    v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT:    v_or_b32_sdwa v0, v63, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v48, v28
-; VI-NEXT:    v_mov_b32_e32 v47, v58
-; VI-NEXT:    v_mov_b32_e32 v45, v44
-; VI-NEXT:    v_mov_b32_e32 v63, v42
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v58, v44
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_mov_b32_e32 v48, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
 ; VI-NEXT:    v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v63, v42
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -58899,8 +58885,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v57, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -58934,43 +58920,44 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
 ; VI-NEXT:    s_branch .LBB39_3
 ; VI-NEXT:  .LBB39_2:
-; VI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v32, v54
+; VI-NEXT:    v_mov_b32_e32 v43, v49
 ; VI-NEXT:    v_mov_b32_e32 v46, v61
-; VI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT:    v_mov_b32_e32 v34, v22
-; VI-NEXT:    v_mov_b32_e32 v32, v23
-; VI-NEXT:    v_mov_b32_e32 v47, v58
-; VI-NEXT:    v_mov_b32_e32 v45, v44
+; VI-NEXT:    v_mov_b32_e32 v47, v45
+; VI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v34, v26
+; VI-NEXT:    v_mov_b32_e32 v58, v44
 ; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_mov_b32_e32 v63, v42
 ; VI-NEXT:    v_mov_b32_e32 v51, v7
-; VI-NEXT:    v_mov_b32_e32 v48, v28
+; VI-NEXT:    v_mov_b32_e32 v48, v29
 ; VI-NEXT:    s_mov_b64 s[4:5], -1
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; VI-NEXT:  .LBB39_3: ; %Flow
 ; VI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; VI-NEXT:    v_mov_b32_e32 v42, v45
-; VI-NEXT:    v_mov_b32_e32 v45, v46
+; VI-NEXT:    v_mov_b32_e32 v44, v47
+; VI-NEXT:    v_mov_b32_e32 v47, v46
 ; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_mov_b32_e32 v46, v49
 ; VI-NEXT:    s_cbranch_vccnz .LBB39_5
 ; VI-NEXT:  ; %bb.4: ; %cmp.true
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
 ; VI-NEXT:    s_add_i32 s28, s28, 3
 ; VI-NEXT:    s_and_b32 s4, s28, 0xff
 ; VI-NEXT:    s_lshl_b32 s5, s29, 8
@@ -59024,7 +59011,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 0x300, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
@@ -59033,8 +59020,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 0x3000000, v0
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 0x3000000, v1
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -59047,8 +59034,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v5, vcc, 0x3000000, v0
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -59123,29 +59110,29 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v11, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v12, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
@@ -59157,8 +59144,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v13, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
@@ -59170,8 +59157,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v14, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
@@ -59183,8 +59170,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v15, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
@@ -59195,8 +59182,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v16, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -59206,8 +59193,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v17, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
@@ -59218,8 +59205,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v18, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -59229,63 +59216,63 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v19, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v20, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v21, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v22, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v23, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -59295,57 +59282,54 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v24, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
-; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v45
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v47
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v25, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v47
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v44
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v26, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v42
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v58
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v27, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v63
+; VI-NEXT:    v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v28, vcc, 0x3000000, v0
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v41
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v40
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
@@ -59630,7 +59614,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:164
 ; GFX9-NEXT:    buffer_load_ushort v23, off, s[0:3], s32 offset:172
 ; GFX9-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:180
-; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:188
+; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:188
 ; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:196
 ; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:204
 ; GFX9-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:212
@@ -59638,11 +59622,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:228
 ; GFX9-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:236
 ; GFX9-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:244
-; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:252
+; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:252
 ; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:260
 ; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:268
-; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:276
-; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:284
+; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:276
+; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:284
 ; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:292
 ; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:300
 ; GFX9-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:308
@@ -59668,7 +59652,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
@@ -59843,7 +59827,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v0, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -59855,7 +59839,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v37, v57
 ; GFX9-NEXT:    v_mov_b32_e32 v57, v60
 ; GFX9-NEXT:    v_mov_b32_e32 v52, v56
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_mov_b32_e32 v34, v0
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -59864,14 +59848,14 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v45, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -59881,12 +59865,12 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v51, v0
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -59940,7 +59924,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:  .LBB39_2:
 ; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
@@ -60302,12 +60286,12 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 3, v45
-; GFX9-NEXT:    v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v0, 0x300, v0
 ; GFX9-NEXT:    v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_add_u32_e32 v0, 3, v44
+; GFX9-NEXT:    v_add_u32_e32 v0, 3, v40
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 3, v55
@@ -60317,7 +60301,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_add_u32_e32 v0, 3, v43
-; GFX9-NEXT:    v_add_u32_e32 v1, 3, v42
+; GFX9-NEXT:    v_add_u32_e32 v1, 3, v36
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v0, 0x300, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -60326,7 +60310,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_add_u32_e32 v0, 3, v36
+; GFX9-NEXT:    v_add_u32_e32 v0, 3, v42
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 3, v41
@@ -67146,10 +67130,10 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    v_mul_f32_e32 v34, 1.0, v36
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s17
-; SI-NEXT:    v_mul_f32_e64 v33, 1.0, s18
+; SI-NEXT:    v_mul_f32_e64 v35, 1.0, s18
 ; SI-NEXT:    v_mul_f32_e64 v36, 1.0, s21
 ; SI-NEXT:    v_mul_f32_e64 v42, 1.0, s20
-; SI-NEXT:    v_mul_f32_e64 v35, 1.0, s22
+; SI-NEXT:    v_mul_f32_e64 v33, 1.0, s22
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
@@ -67178,9 +67162,9 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB43_4
@@ -67191,13 +67175,12 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; SI-NEXT:    v_alignbit_b32 v5, v5, v8, 16
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_alignbit_b32 v3, v3, v35, 16
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_alignbit_b32 v1, v1, v35, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_alignbit_b32 v4, v4, v9, 16
 ; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(5)
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_mov_b32_e32 v59, v2
 ; SI-NEXT:    v_alignbit_b32 v0, v0, v2, 16
@@ -67207,11 +67190,10 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    v_mov_b32_e32 v47, v10
 ; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
 ; SI-NEXT:    v_mov_b32_e32 v45, v12
-; SI-NEXT:    v_mov_b32_e32 v44, v14
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_alignbit_b32 v3, v3, v33, 16
+; SI-NEXT:    v_mov_b32_e32 v33, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_alignbit_b32 v1, v1, v33, 16
 ; SI-NEXT:    v_mov_b32_e32 v62, v38
 ; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
 ; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
@@ -67251,20 +67233,20 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    v_alignbit_b32 v28, v28, v37, 16
 ; SI-NEXT:    v_mov_b32_e32 v37, v34
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(3) expcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_mov_b32_e32 v35, v7
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_mov_b32_e32 v43, v8
 ; SI-NEXT:    v_alignbit_b32 v7, v7, v8, 16
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_mov_b32_e32 v60, v9
+; SI-NEXT:    s_waitcnt vmcnt(2) expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v42, v9
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v32
 ; SI-NEXT:    v_alignbit_b32 v31, v31, v34, 16
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v42, v8
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v60, v8
 ; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
 ; SI-NEXT:    v_alignbit_b32 v8, v8, v9, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v11
@@ -67288,7 +67270,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    v_alignbit_b32 v12, v12, v14, 16
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v33, v14
+; SI-NEXT:    v_mov_b32_e32 v44, v14
 ; SI-NEXT:    v_alignbit_b32 v13, v13, v14, 16
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -67311,7 +67293,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
 ; SI-NEXT:  .LBB43_2: ; %cmp.true
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v59
 ; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
@@ -67327,7 +67309,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v35
 ; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
 ; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v42
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v60
 ; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
 ; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v57
@@ -67339,7 +67321,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v45
 ; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
 ; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v44
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v33
 ; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
 ; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v36
@@ -67451,7 +67433,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
@@ -67474,7 +67456,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v43
 ; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
 ; SI-NEXT:    v_alignbit_b32 v7, v8, v7, 16
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v60
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v42
 ; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
 ; SI-NEXT:    v_alignbit_b32 v8, v9, v8, 16
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v58
@@ -67489,7 +67471,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v63
 ; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
 ; SI-NEXT:    v_alignbit_b32 v12, v13, v12, 16
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v33
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v44
 ; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
 ; SI-NEXT:    v_alignbit_b32 v13, v14, v13, 16
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v62
@@ -67575,16 +67557,16 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB43_4:
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    s_waitcnt expcnt(3)
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; SI-NEXT:    v_mov_b32_e32 v61, v53
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
@@ -67593,7 +67575,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    v_mov_b32_e32 v57, v11
 ; SI-NEXT:    v_mov_b32_e32 v47, v10
 ; SI-NEXT:    v_mov_b32_e32 v45, v12
-; SI-NEXT:    v_mov_b32_e32 v44, v14
+; SI-NEXT:    v_mov_b32_e32 v33, v14
 ; SI-NEXT:    v_mov_b32_e32 v62, v38
 ; SI-NEXT:    v_mov_b32_e32 v38, v39
 ; SI-NEXT:    v_mov_b32_e32 v39, v41
@@ -70659,96 +70641,36 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $vgpr60
 ; SI-NEXT:    ; implicit-def: $vgpr61
 ; SI-NEXT:    ; implicit-def: $vgpr58
 ; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr56
+; SI-NEXT:    ; implicit-def: $vgpr43
 ; SI-NEXT:    ; implicit-def: $vgpr57
-; SI-NEXT:    ; implicit-def: $vgpr46
+; SI-NEXT:    ; implicit-def: $vgpr42
 ; SI-NEXT:    ; implicit-def: $vgpr47
-; SI-NEXT:    ; implicit-def: $vgpr40
+; SI-NEXT:    ; implicit-def: $vgpr44
 ; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr42
-; SI-NEXT:    ; implicit-def: $vgpr43
-; SI-NEXT:    ; implicit-def: $vgpr41
 ; SI-NEXT:    ; implicit-def: $vgpr38
+; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr55
+; SI-NEXT:    ; implicit-def: $vgpr53
 ; SI-NEXT:    ; implicit-def: $vgpr51
 ; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; implicit-def: $vgpr37
 ; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; implicit-def: $vgpr44
-; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr46
+; SI-NEXT:    ; implicit-def: $vgpr56
+; SI-NEXT:    ; implicit-def: $vgpr50
 ; SI-NEXT:    ; implicit-def: $vgpr52
-; SI-NEXT:    ; implicit-def: $vgpr55
 ; SI-NEXT:    ; implicit-def: $vgpr54
-; SI-NEXT:    ; implicit-def: $vgpr50
-; SI-NEXT:    ; implicit-def: $vgpr53
 ; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; implicit-def: $vgpr40
 ; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $vgpr34
+; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; SI-NEXT:    ; implicit-def: $vgpr31
@@ -70758,99 +70680,173 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; kill: killed $vgpr31
 ; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB44_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v62
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v32, v31
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v63
-; SI-NEXT:    v_cvt_f32_f16_e32 v36, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v34, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v53, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v40, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v29
 ; SI-NEXT:    v_cvt_f32_f16_e32 v54, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v52, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v50, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v27
-; SI-NEXT:    v_cvt_f32_f16_e32 v44, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v46, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v26
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v38, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v7
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v36, v63
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v25
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, v32
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
+; SI-NEXT:    v_cvt_f32_f16_e32 v48, v30
+; SI-NEXT:    v_cvt_f32_f16_e32 v52, v29
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v24
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v4
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
+; SI-NEXT:    v_cvt_f32_f16_e32 v56, v28
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v23
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v22
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v21
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
 ; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v44, v5
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v20
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v47, v32
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v42, v4
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v57, v32
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v43, v3
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v18
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v59, v32
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v58, v2
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v17
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v61, v32
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v60, v1
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v62
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v63
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr17
+; SI-NEXT:    ; implicit-def: $vgpr18
+; SI-NEXT:    ; implicit-def: $vgpr19
+; SI-NEXT:    ; implicit-def: $vgpr20
+; SI-NEXT:    ; implicit-def: $vgpr21
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr23
+; SI-NEXT:    ; implicit-def: $vgpr24
+; SI-NEXT:    ; implicit-def: $vgpr25
+; SI-NEXT:    ; implicit-def: $vgpr26
+; SI-NEXT:    ; implicit-def: $vgpr27
+; SI-NEXT:    ; implicit-def: $vgpr28
+; SI-NEXT:    ; implicit-def: $vgpr29
+; SI-NEXT:    ; implicit-def: $vgpr30
+; SI-NEXT:    ; implicit-def: $vgpr63
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v15
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, v31
@@ -70865,6 +70861,16 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v51, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v53, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v55, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v41, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v38, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v45, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
 ; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
@@ -70872,37 +70878,28 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v50, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v55, v29
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v46, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v56, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v58, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v60, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v47, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v57, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v59, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v61, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, v62
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr3
@@ -70919,22 +70916,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr15
 ; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr30
 ; SI-NEXT:    ; implicit-def: $vgpr62
-; SI-NEXT:    ; implicit-def: $vgpr63
 ; SI-NEXT:  .LBB44_2: ; %Flow
 ; SI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB44_4
@@ -70942,80 +70924,89 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    v_add_f32_e32 v5, 1.0, v5
 ; SI-NEXT:    v_add_f32_e32 v33, 1.0, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_add_f32_e32 v6, 1.0, v6
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; SI-NEXT:    v_add_f32_e32 v7, 1.0, v7
+; SI-NEXT:    v_add_f32_e32 v28, 1.0, v28
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    v_add_f32_e32 v9, 1.0, v9
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
-; SI-NEXT:    v_add_f32_e32 v21, 1.0, v21
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v28
+; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v25, 1.0, v25
+; SI-NEXT:    v_add_f32_e32 v26, 1.0, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v25
+; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v26
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v50
+; SI-NEXT:    v_mov_b32_e32 v50, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v48
+; SI-NEXT:    v_add_f32_e32 v24, 1.0, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v24
+; SI-NEXT:    v_add_f32_e32 v23, 1.0, v23
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v38
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v23
+; SI-NEXT:    v_add_f32_e32 v22, 1.0, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v22
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v56
+; SI-NEXT:    v_add_f32_e32 v21, 1.0, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v21
 ; SI-NEXT:    v_add_f32_e32 v20, 1.0, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v20
-; SI-NEXT:    v_add_f32_e32 v19, 1.0, v19
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v34
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v20
+; SI-NEXT:    v_add_f32_e32 v19, 1.0, v19
 ; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v19
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v32
 ; SI-NEXT:    v_add_f32_e32 v18, 1.0, v18
 ; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v18
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v61
 ; SI-NEXT:    v_add_f32_e32 v17, 1.0, v17
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v46
 ; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v17
 ; SI-NEXT:    v_add_f32_e32 v16, 1.0, v16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v59
 ; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v16
-; SI-NEXT:    v_add_f32_e32 v9, 1.0, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v9
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v57
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v41
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_add_f32_e32 v26, 1.0, v26
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v47
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v26
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v55
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v61
+; SI-NEXT:    v_add_f32_e32 v7, 1.0, v7
 ; SI-NEXT:    v_add_f32_e32 v8, 1.0, v8
-; SI-NEXT:    v_add_f32_e32 v25, 1.0, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v25
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v7
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v38
-; SI-NEXT:    v_cvt_f32_f16_e32 v38, v43
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_add_f32_e32 v40, 1.0, v63
-; SI-NEXT:    v_add_f32_e32 v42, 1.0, v62
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v40
-; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v42
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v59
+; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v42
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v62
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v63
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v57
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
 ; SI-NEXT:    v_add_f32_e32 v27, 1.0, v27
+; SI-NEXT:    s_waitcnt vmcnt(11)
+; SI-NEXT:    v_add_f32_e32 v42, 1.0, v63
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v47
+; SI-NEXT:    v_add_f32_e32 v44, 1.0, v62
 ; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
 ; SI-NEXT:    v_add_f32_e32 v3, 1.0, v3
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v4, 1.0, v4
 ; SI-NEXT:    v_add_f32_e32 v10, 1.0, v10
 ; SI-NEXT:    v_add_f32_e32 v11, 1.0, v11
@@ -71023,40 +71014,39 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    v_add_f32_e32 v13, 1.0, v13
 ; SI-NEXT:    v_add_f32_e32 v14, 1.0, v14
 ; SI-NEXT:    v_add_f32_e32 v15, 1.0, v15
-; SI-NEXT:    v_add_f32_e32 v22, 1.0, v22
-; SI-NEXT:    v_add_f32_e32 v23, 1.0, v23
-; SI-NEXT:    v_add_f32_e32 v24, 1.0, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v27
-; SI-NEXT:    v_add_f32_e32 v28, 1.0, v28
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v27
 ; SI-NEXT:    v_add_f32_e32 v29, 1.0, v29
 ; SI-NEXT:    v_add_f32_e32 v30, 1.0, v30
+; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v42
+; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v44
 ; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v10
 ; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v11
 ; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v12
 ; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v13
 ; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v23
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v28
 ; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v29
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v40
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v30
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, v44
+; SI-NEXT:    v_cvt_f32_f16_e32 v36, v42
 ; SI-NEXT:    v_cvt_f32_f16_e32 v30, v30
 ; SI-NEXT:    v_cvt_f32_f16_e32 v29, v29
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v44, v5
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v46, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v42, v4
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v56, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v43, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v58, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v33
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v50
-; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v62
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v63
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v52
 ; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
 ; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
 ; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
@@ -71075,59 +71065,51 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v60, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v53, v53
+; SI-NEXT:    v_cvt_f32_f16_e32 v40, v40
 ; SI-NEXT:    v_cvt_f32_f16_e32 v54, v54
-; SI-NEXT:    v_cvt_f32_f16_e32 v52, v52
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v36
-; SI-NEXT:    v_mov_b32_e32 v36, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v34
-; SI-NEXT:    v_mov_b32_e32 v34, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v44
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, v45
 ; SI-NEXT:    v_cvt_f32_f16_e32 v35, v35
 ; SI-NEXT:    v_cvt_f32_f16_e32 v37, v37
 ; SI-NEXT:    v_cvt_f32_f16_e32 v39, v39
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, v49
 ; SI-NEXT:    v_cvt_f32_f16_e32 v51, v51
+; SI-NEXT:    v_cvt_f32_f16_e32 v53, v53
+; SI-NEXT:    v_cvt_f32_f16_e32 v55, v55
+; SI-NEXT:    v_cvt_f32_f16_e32 v41, v41
 ; SI-NEXT:    v_cvt_f32_f16_e32 v47, v5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v57, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v59, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v61, v2
-; SI-NEXT:    v_mov_b32_e32 v50, v30
-; SI-NEXT:    v_mov_b32_e32 v55, v29
-; SI-NEXT:    v_mov_b32_e32 v44, v8
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, v1
+; SI-NEXT:    v_mov_b32_e32 v52, v29
+; SI-NEXT:    v_mov_b32_e32 v48, v30
+; SI-NEXT:    v_mov_b32_e32 v56, v28
+; SI-NEXT:    v_mov_b32_e32 v34, v7
+; SI-NEXT:    v_mov_b32_e32 v32, v6
+; SI-NEXT:    v_mov_b32_e32 v46, v8
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_cvt_f32_f16_e32 v38, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v45, v1
 ; SI-NEXT:  .LBB44_4: ; %end
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
@@ -71146,32 +71128,34 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v57
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v56
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v43
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v47
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v46
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v42
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v45
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v40
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v44
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v43
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v42
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v38
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v41
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 24, v0
@@ -71180,25 +71164,25 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v38
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v55
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v31
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v53
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v51
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 36, v0
@@ -71207,7 +71191,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v49
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 40, v0
@@ -71216,7 +71200,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v39
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 44, v0
@@ -71225,7 +71209,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v37
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
@@ -71234,7 +71218,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v35
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
@@ -71243,7 +71227,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v33
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 56, v0
@@ -71253,8 +71237,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -71264,8 +71248,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 64, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -71275,8 +71259,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x44, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -71286,8 +71270,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x48, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -71297,8 +71281,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x4c, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -71308,8 +71292,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x50, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -71319,8 +71303,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x54, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -71330,8 +71314,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x58, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -71341,8 +71325,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x5c, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -71352,8 +71336,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x60, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -71363,8 +71347,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x64, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -71373,48 +71357,46 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v44
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v46
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x68, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v52
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v50
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v56
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x6c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v54
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v55
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v52
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x70, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v53
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v50
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v40
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v48
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x74, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v36
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v48
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v34
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v36
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x78, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v34
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v32
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v32
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v31
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -71647,12 +71629,12 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    s_lshr_b32 s4, s12, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v32, s4
 ; SI-NEXT:    s_lshr_b32 s4, s13, 16
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_cvt_f32_f16_e32 v61, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, s4
 ; SI-NEXT:    s_lshr_b32 s4, s14, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, s4
 ; SI-NEXT:    s_lshr_b32 s4, s15, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v18, s4
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v63, s4
 ; SI-NEXT:    s_lshr_b32 s4, s40, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v21, s4
 ; SI-NEXT:    s_lshr_b32 s4, s41, 16
@@ -71708,8 +71690,8 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, s6
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, s7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v9, s8
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, s10
-; SI-NEXT:    v_cvt_f32_f16_e32 v59, s11
+; SI-NEXT:    v_cvt_f32_f16_e32 v59, s10
+; SI-NEXT:    v_cvt_f32_f16_e32 v61, s11
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    v_cvt_f32_f16_e32 v20, s15
 ; SI-NEXT:    v_cvt_f32_f16_e32 v22, s40
@@ -71732,83 +71714,67 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, s20
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, s19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, s18
-; SI-NEXT:    v_cvt_f32_f16_e32 v63, s17
+; SI-NEXT:    v_cvt_f32_f16_e32 v18, s17
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s16
 ; SI-NEXT:    s_cbranch_execnz .LBB45_3
 ; SI-NEXT:  .LBB45_2: ; %cmp.true
-; SI-NEXT:    v_add_f32_e64 v1, s18, 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_f32_e64 v2, s19, 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-NEXT:    v_add_f32_e64 v4, s20, 1.0
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
-; SI-NEXT:    v_add_f32_e64 v20, s15, 1.0
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    v_add_f32_e64 v18, s27, 1.0
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v20
-; SI-NEXT:    v_add_f32_e64 v49, s8, 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v49
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v49
-; SI-NEXT:    v_cvt_f32_f16_e32 v49, v18
-; SI-NEXT:    v_cvt_f32_f16_e32 v18, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e64 v15, s12, 1.0
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v15
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; SI-NEXT:    v_add_f32_e64 v1, s18, 1.0
 ; SI-NEXT:    v_add_f32_e64 v17, s13, 1.0
-; SI-NEXT:    v_add_f32_e64 v10, s23, 1.0
-; SI-NEXT:    v_add_f32_e64 v16, s26, 1.0
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, v17
-; SI-NEXT:    v_add_f32_e64 v23, s29, 1.0
-; SI-NEXT:    v_add_f32_e64 v29, s45, 1.0
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_add_f32_e64 v2, s19, 1.0
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-NEXT:    v_add_f32_e64 v22, s40, 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v16
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; SI-NEXT:    v_add_f32_e64 v21, s28, 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v23
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v29
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v22
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v22
 ; SI-NEXT:    v_add_f32_e64 v19, s14, 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v21
 ; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v19
 ; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, v19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v19, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v38
-; SI-NEXT:    v_cvt_f32_f16_e32 v38, v52
-; SI-NEXT:    v_cvt_f32_f16_e32 v52, v42
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v56
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e64 v41, s6, 1.0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v41
-; SI-NEXT:    v_add_f32_e64 v8, s22, 1.0
+; SI-NEXT:    v_add_f32_e64 v6, s21, 1.0
+; SI-NEXT:    v_add_f32_e64 v10, s23, 1.0
 ; SI-NEXT:    v_add_f32_e64 v14, s25, 1.0
+; SI-NEXT:    v_add_f32_e64 v18, s27, 1.0
+; SI-NEXT:    v_add_f32_e64 v23, s29, 1.0
 ; SI-NEXT:    v_add_f32_e64 v27, s46, 1.0
-; SI-NEXT:    v_add_f32_e64 v24, s41, 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v27
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v24
-; SI-NEXT:    v_add_f32_e64 v37, s10, 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v37
+; SI-NEXT:    v_add_f32_e64 v26, s42, 1.0
+; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v23
+; SI-NEXT:    v_add_f32_e64 v25, s47, 1.0
+; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v27
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v25
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v17
 ; SI-NEXT:    v_add_f32_e64 v53, s7, 1.0
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v37
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v48
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v54
-; SI-NEXT:    v_cvt_f32_f16_e32 v54, v44
-; SI-NEXT:    v_cvt_f32_f16_e32 v44, v57
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v25
+; SI-NEXT:    v_cvt_f32_f16_e32 v25, v33
+; SI-NEXT:    v_cvt_f32_f16_e32 v33, v38
+; SI-NEXT:    v_cvt_f32_f16_e32 v38, v50
+; SI-NEXT:    v_cvt_f32_f16_e32 v50, v54
+; SI-NEXT:    v_cvt_f32_f16_e32 v54, v42
+; SI-NEXT:    v_cvt_f32_f16_e32 v42, v46
+; SI-NEXT:    v_cvt_f32_f16_e32 v46, v57
+; SI-NEXT:    v_add_f32_e64 v49, s8, 1.0
 ; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v53
 ; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -71816,16 +71782,17 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    v_cvt_f32_f16_e32 v53, v14
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
 ; SI-NEXT:    v_add_f32_e64 v1, s17, 1.0
-; SI-NEXT:    v_add_f32_e64 v6, s21, 1.0
 ; SI-NEXT:    v_add_f32_e64 v28, s43, 1.0
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v49
 ; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v41
 ; SI-NEXT:    v_add_f32_e64 v45, s9, 1.0
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v49, v18
 ; SI-NEXT:    v_cvt_f32_f16_e32 v41, v10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v63, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v18, v1
 ; SI-NEXT:    v_add_f32_e64 v1, s16, 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v6
 ; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v28
 ; SI-NEXT:    v_add_f32_e64 v34, s11, 1.0
 ; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v45
@@ -71833,24 +71800,35 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    v_cvt_f32_f16_e32 v45, v6
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v34
-; SI-NEXT:    v_cvt_f32_f16_e32 v59, v34
+; SI-NEXT:    v_cvt_f32_f16_e32 v61, v34
 ; SI-NEXT:    v_cvt_f32_f16_e32 v34, v27
 ; SI-NEXT:    v_cvt_f32_f16_e32 v27, v62
 ; SI-NEXT:    v_cvt_f32_f16_e32 v62, v6
+; SI-NEXT:    v_add_f32_e64 v4, s20, 1.0
+; SI-NEXT:    v_add_f32_e64 v8, s22, 1.0
 ; SI-NEXT:    v_add_f32_e64 v12, s24, 1.0
-; SI-NEXT:    v_add_f32_e64 v25, s47, 1.0
+; SI-NEXT:    v_add_f32_e64 v16, s26, 1.0
+; SI-NEXT:    v_add_f32_e64 v29, s45, 1.0
 ; SI-NEXT:    v_add_f32_e64 v30, s44, 1.0
-; SI-NEXT:    v_add_f32_e64 v26, s42, 1.0
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cvt_f32_f16_e32 v56, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v25
+; SI-NEXT:    v_add_f32_e64 v24, s41, 1.0
+; SI-NEXT:    v_add_f32_e64 v20, s15, 1.0
+; SI-NEXT:    v_add_f32_e64 v37, s10, 1.0
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v29
 ; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v26
-; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v37
+; SI-NEXT:    v_cvt_f32_f16_e32 v59, v37
 ; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_cvt_f32_f16_e32 v57, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
 ; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
 ; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
@@ -71858,7 +71836,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    v_cvt_f32_f16_e32 v30, v30
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v25
+; SI-NEXT:    v_cvt_f32_f16_e32 v37, v23
 ; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
@@ -71871,19 +71849,19 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    v_cvt_f32_f16_e32 v39, v39
 ; SI-NEXT:    v_cvt_f32_f16_e32 v35, v35
 ; SI-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; SI-NEXT:    v_cvt_f32_f16_e32 v61, v61
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v36
+; SI-NEXT:    v_cvt_f32_f16_e32 v63, v63
+; SI-NEXT:    v_cvt_f32_f16_e32 v23, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v29, v60
-; SI-NEXT:    v_cvt_f32_f16_e32 v36, v50
-; SI-NEXT:    v_cvt_f32_f16_e32 v50, v40
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v46
-; SI-NEXT:    v_cvt_f32_f16_e32 v46, v58
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, v36
+; SI-NEXT:    v_cvt_f32_f16_e32 v36, v48
+; SI-NEXT:    v_cvt_f32_f16_e32 v48, v52
+; SI-NEXT:    v_cvt_f32_f16_e32 v52, v40
+; SI-NEXT:    v_cvt_f32_f16_e32 v40, v44
+; SI-NEXT:    v_cvt_f32_f16_e32 v44, v56
+; SI-NEXT:    v_cvt_f32_f16_e32 v56, v58
 ; SI-NEXT:    v_cvt_f32_f16_e32 v60, v2
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cvt_f32_f16_e32 v57, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v58, v3
 ; SI-NEXT:  .LBB45_3: ; %end
@@ -71895,7 +71873,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v60
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v63
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v18
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
@@ -72040,7 +72018,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v63
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v20
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x58, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -72057,7 +72035,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v61
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v13
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x60, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -72075,14 +72053,14 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v35
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v59
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v61
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x68, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v39
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v13
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v59
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x6c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -72139,7 +72117,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr62
-; SI-NEXT:    ; implicit-def: $vgpr63
+; SI-NEXT:    ; implicit-def: $vgpr18
 ; SI-NEXT:    ; implicit-def: $vgpr60
 ; SI-NEXT:    ; implicit-def: $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr58
@@ -72182,14 +72160,14 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    ; implicit-def: $vgpr22
 ; SI-NEXT:    ; implicit-def: $vgpr21
 ; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr18
+; SI-NEXT:    ; implicit-def: $vgpr63
 ; SI-NEXT:    ; implicit-def: $vgpr11
 ; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr13
 ; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr59
+; SI-NEXT:    ; implicit-def: $vgpr61
 ; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $vgpr59
 ; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; implicit-def: $vgpr9
 ; SI-NEXT:    ; implicit-def: $vgpr51
@@ -76593,24 +76571,24 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:36
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:48
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:48
 ; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:52
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:64
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:68
-; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v55, 16, v3
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:68
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v55, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v43, 16, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v15
 ; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v17
 ; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v19
 ; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v21
@@ -76631,23 +76609,23 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_lshlrev_b32_e32 v54, 16, v12
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v38
 ; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v36
 ; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v34
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v32
-; SI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v32
+; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
@@ -76660,46 +76638,45 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB51_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    s_waitcnt expcnt(3)
+; SI-NEXT:    s_waitcnt expcnt(2)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v7, v0, v58
+; SI-NEXT:    v_or_b32_e32 v7, v0, v61
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v4
-; SI-NEXT:    v_or_b32_e32 v9, v0, v61
+; SI-NEXT:    v_or_b32_e32 v9, v0, v50
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v6
-; SI-NEXT:    v_or_b32_e32 v10, v0, v50
+; SI-NEXT:    v_or_b32_e32 v10, v0, v43
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v57
-; SI-NEXT:    v_or_b32_e32 v11, v0, v45
+; SI-NEXT:    v_or_b32_e32 v11, v0, v41
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v56
-; SI-NEXT:    v_or_b32_e32 v12, v0, v41
+; SI-NEXT:    v_or_b32_e32 v12, v0, v40
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v49
-; SI-NEXT:    v_mov_b32_e32 v36, v41
-; SI-NEXT:    v_mov_b32_e32 v41, v13
+; SI-NEXT:    v_mov_b32_e32 v52, v57
+; SI-NEXT:    v_mov_b32_e32 v57, v40
+; SI-NEXT:    v_mov_b32_e32 v40, v49
+; SI-NEXT:    v_mov_b32_e32 v49, v13
 ; SI-NEXT:    v_or_b32_e32 v13, v0, v13
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v14
-; SI-NEXT:    v_mov_b32_e32 v51, v50
-; SI-NEXT:    v_mov_b32_e32 v50, v45
-; SI-NEXT:    v_mov_b32_e32 v45, v14
-; SI-NEXT:    v_or_b32_e32 v14, v0, v40
+; SI-NEXT:    v_mov_b32_e32 v36, v41
+; SI-NEXT:    v_mov_b32_e32 v41, v14
+; SI-NEXT:    v_or_b32_e32 v14, v0, v48
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v16
-; SI-NEXT:    v_mov_b32_e32 v52, v57
-; SI-NEXT:    v_mov_b32_e32 v57, v49
-; SI-NEXT:    v_mov_b32_e32 v49, v40
-; SI-NEXT:    v_mov_b32_e32 v40, v15
+; SI-NEXT:    v_mov_b32_e32 v51, v50
+; SI-NEXT:    v_mov_b32_e32 v50, v43
+; SI-NEXT:    v_mov_b32_e32 v43, v48
+; SI-NEXT:    v_mov_b32_e32 v48, v15
 ; SI-NEXT:    v_or_b32_e32 v15, v0, v15
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v18
-; SI-NEXT:    v_mov_b32_e32 v34, v58
-; SI-NEXT:    v_mov_b32_e32 v58, v61
+; SI-NEXT:    v_mov_b32_e32 v38, v61
 ; SI-NEXT:    v_mov_b32_e32 v61, v56
 ; SI-NEXT:    v_mov_b32_e32 v56, v16
 ; SI-NEXT:    v_or_b32_e32 v16, v0, v37
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v20
-; SI-NEXT:    s_waitcnt expcnt(2)
 ; SI-NEXT:    v_or_b32_e32 v17, v0, v17
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v22
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -76733,7 +76710,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    s_and_b32 s6, s20, 0xffff
 ; SI-NEXT:    s_lshl_b32 s7, s21, 16
 ; SI-NEXT:    v_or_b32_e32 v26, v0, v26
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v59
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v45
 ; SI-NEXT:    s_or_b32 s6, s6, s7
 ; SI-NEXT:    s_and_b32 s7, s22, 0xffff
 ; SI-NEXT:    s_lshl_b32 s8, s23, 16
@@ -76744,7 +76721,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    s_lshl_b32 s9, s25, 16
 ; SI-NEXT:    v_mov_b32_e32 v33, v28
 ; SI-NEXT:    v_or_b32_e32 v28, v0, v5
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v43
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v59
 ; SI-NEXT:    s_or_b32 s8, s8, s9
 ; SI-NEXT:    s_and_b32 s9, s26, 0xffff
 ; SI-NEXT:    s_lshl_b32 s10, s27, 16
@@ -76756,7 +76733,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    s_lshl_b32 s11, s29, 16
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; SI-NEXT:    v_or_b32_e32 v30, v0, v3
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v38
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v58
 ; SI-NEXT:    s_or_b32 s10, s10, s11
 ; SI-NEXT:    v_mov_b32_e32 v63, v2
 ; SI-NEXT:    v_mov_b32_e32 v32, v55
@@ -76764,9 +76741,9 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v55, v4
 ; SI-NEXT:    v_mov_b32_e32 v53, v6
 ; SI-NEXT:    v_mov_b32_e32 v47, v46
-; SI-NEXT:    v_mov_b32_e32 v59, v44
-; SI-NEXT:    v_mov_b32_e32 v43, v42
-; SI-NEXT:    v_or_b32_e32 v31, v0, v48
+; SI-NEXT:    v_mov_b32_e32 v45, v44
+; SI-NEXT:    v_mov_b32_e32 v59, v42
+; SI-NEXT:    v_or_b32_e32 v31, v0, v34
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    v_mov_b32_e32 v2, s6
@@ -76776,13 +76753,12 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v6, s10
 ; SI-NEXT:    s_cbranch_execnz .LBB51_3
 ; SI-NEXT:  .LBB51_2: ; %cmp.true
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v63
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v32, v1
 ; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x30000, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v38, v43
 ; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s17, 16
@@ -76826,42 +76802,42 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v34, v0
+; SI-NEXT:    v_or_b32_e32 v0, v38, v0
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v55
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v58, v0
+; SI-NEXT:    v_or_b32_e32 v0, v51, v0
 ; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v53
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v51, v0
+; SI-NEXT:    v_or_b32_e32 v0, v50, v0
 ; SI-NEXT:    v_add_i32_e32 v10, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v52
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v50, v0
+; SI-NEXT:    v_or_b32_e32 v0, v36, v0
 ; SI-NEXT:    v_add_i32_e32 v11, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v61
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v36, v0
+; SI-NEXT:    v_or_b32_e32 v0, v57, v0
 ; SI-NEXT:    v_add_i32_e32 v12, vcc, 0x30000, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v57
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v40
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v41, v0
+; SI-NEXT:    v_or_b32_e32 v0, v49, v0
 ; SI-NEXT:    v_add_i32_e32 v13, vcc, 0x30000, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v45
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v41
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v49, v0
+; SI-NEXT:    v_or_b32_e32 v0, v43, v0
 ; SI-NEXT:    v_add_i32_e32 v14, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v56
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v40, v0
+; SI-NEXT:    v_or_b32_e32 v0, v48, v0
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 0x30000, v0
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v37
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -76923,7 +76899,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v25, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v47
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -76939,12 +76915,12 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v27, vcc, 0x30000, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v59
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v45
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    v_add_i32_e32 v28, vcc, 0x30000, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
@@ -76953,7 +76929,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v29, vcc, 0x30000, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v38
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v59
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -76988,26 +76964,26 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB51_4:
-; SI-NEXT:    v_mov_b32_e32 v34, v58
+; SI-NEXT:    v_mov_b32_e32 v38, v61
 ; SI-NEXT:    v_mov_b32_e32 v32, v55
-; SI-NEXT:    v_mov_b32_e32 v58, v61
 ; SI-NEXT:    v_mov_b32_e32 v63, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v4
 ; SI-NEXT:    v_mov_b32_e32 v53, v6
 ; SI-NEXT:    v_mov_b32_e32 v52, v57
 ; SI-NEXT:    v_mov_b32_e32 v51, v50
 ; SI-NEXT:    v_mov_b32_e32 v61, v56
-; SI-NEXT:    v_mov_b32_e32 v50, v45
+; SI-NEXT:    v_mov_b32_e32 v50, v43
 ; SI-NEXT:    v_mov_b32_e32 v36, v41
-; SI-NEXT:    v_mov_b32_e32 v41, v13
-; SI-NEXT:    v_mov_b32_e32 v57, v49
-; SI-NEXT:    v_mov_b32_e32 v49, v40
-; SI-NEXT:    v_mov_b32_e32 v40, v15
-; SI-NEXT:    v_mov_b32_e32 v45, v14
+; SI-NEXT:    v_mov_b32_e32 v57, v40
+; SI-NEXT:    v_mov_b32_e32 v40, v49
+; SI-NEXT:    v_mov_b32_e32 v49, v13
+; SI-NEXT:    v_mov_b32_e32 v43, v48
+; SI-NEXT:    v_mov_b32_e32 v48, v15
+; SI-NEXT:    v_mov_b32_e32 v41, v14
 ; SI-NEXT:    v_mov_b32_e32 v56, v16
 ; SI-NEXT:    v_mov_b32_e32 v47, v46
-; SI-NEXT:    v_mov_b32_e32 v59, v44
-; SI-NEXT:    v_mov_b32_e32 v43, v42
+; SI-NEXT:    v_mov_b32_e32 v45, v44
+; SI-NEXT:    v_mov_b32_e32 v59, v42
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    v_mov_b32_e32 v37, v20
 ; SI-NEXT:    v_mov_b32_e32 v39, v23
@@ -85947,7 +85923,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v14
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v16, s37, 5
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v17, s101, 5
-; GFX11-TRUE16-NEXT:    s_mov_b32 vcc_hi, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s57, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s42, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18 : SGPR spill to VGPR lane
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19 : SGPR spill to VGPR lane
@@ -86125,7 +86101,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s42, 0
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s42, s18, 8
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, vcc_hi
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s57
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB57_3
 ; GFX11-TRUE16-NEXT:  .LBB57_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_u32 s0, s0, 3
@@ -86798,9 +86774,9 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB57_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 0
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 0
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr101_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr100_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr56_lo16
@@ -86858,72 +86834,72 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr72_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr62_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr60_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 1
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 2
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 3
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 4
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 5
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 6
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 7
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 8
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 9
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 10
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 11
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 12
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 13
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 14
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 15
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 17
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 18
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 19
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 20
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 21
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 22
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 23
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 24
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 25
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 26
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 27
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 28
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 29
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 30
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s58, 31
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v19, s58, 0
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v19, s58, 1
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 1
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 2
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 3
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 4
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 5
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 6
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 7
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 8
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 9
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 10
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 11
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 12
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 13
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 14
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 15
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 17
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 18
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 19
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 20
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 21
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 22
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 23
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 24
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 25
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 26
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 27
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 28
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 29
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 30
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v18, s57, 31
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v19, s57, 0
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v19, s57, 1
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v19, s100, 2
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v19, s101, 3
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr100_lo16
@@ -86975,7 +86951,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s5, v14
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v16, s37, 5
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v17, s101, 5
-; GFX11-FAKE16-NEXT:    s_mov_b32 s44, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s101, 0
 ; GFX11-FAKE16-NEXT:    s_and_b32 s42, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19 : SGPR spill to VGPR lane
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18 : SGPR spill to VGPR lane
@@ -87008,155 +86984,297 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v16, s85, 29
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v16, s86, 30
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v16, s87, 31
-; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB57_4
+; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB57_2
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s5, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[46:47], s[4:5], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 4
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s5, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s102, s29, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s103, s28, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s104, s28, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 5
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s5, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 vcc_hi, s27, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s34, s27, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s35, s27, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 6
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s4, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s36, s26, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s37, s26, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s38, s25, 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 7
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s4, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s39, s25, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s48, s25, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s49, s24, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s7, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s50, s24, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s51, s23, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s52, s23, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 9
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s7, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s53, s23, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s54, s22, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s55, s22, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 10
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s7, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s64, s21, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s65, s21, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s66, s21, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 11
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s6, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s67, s20, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s68, s20, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s69, s19, 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 12
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s6, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s70, s19, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s71, s19, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s80, s18, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 13
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s9, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s81, s18, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s82, s17, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s83, s17, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 14
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s9, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s84, s17, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s85, s16, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s86, s16, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 15
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s9, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s87, s3, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s96, s3, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s97, s3, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s8, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s2, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s25, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[4:5], 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s24, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s104, s5, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s102, s5, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s103, s5, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 17
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s24, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s57, s4, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s47, s4, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s7, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 18
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s23, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 vcc_hi, s7, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s34, s7, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s69, s6, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 19
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s23, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s56, s6, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s35, s9, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s36, s9, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 20
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s23, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s37, s9, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s38, s8, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s39, s8, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 21
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s22, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s48, s11, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s49, s11, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s50, s11, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 22
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s22, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s51, s10, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s52, s10, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s53, s13, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 23
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s21, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s54, s13, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s55, s13, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s64, s12, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s21, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s65, s12, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s66, s15, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s67, s15, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 25
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s21, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s68, s15, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s59, s14, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s58, s14, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 26
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s20, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s70, s41, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s71, s41, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s60, s41, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 27
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s20, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s80, s40, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s61, s40, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s81, s29, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 28
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s19, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s82, s29, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s83, s29, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s84, s28, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 29
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s19, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s85, s28, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s86, s27, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s72, s27, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 30
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s19, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s87, s27, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s73, s26, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s96, s26, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s43, 31
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s18, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s97, s25, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s43, 0
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s18, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 14
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s25, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s74, s2, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s43, 1
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s17, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 15
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[6:7], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s98, s1, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s43, 2
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s17, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 12
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s99, s1, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 17
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s8, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s100, s1, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s0, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s101, s0, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 18
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s11, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[56:57], s[10:11], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[58:59], s[12:13], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[60:61], s[14:15], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 19
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s11, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[40:41], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[72:73], s[28:29], 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s43, 3
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s17, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 13
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[8:9], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s44, s0, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s43, 4
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s16, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 10
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s0, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[76:77], s[26:27], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 20
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s11, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s43, 5
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s16, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 11
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[10:11], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[88:89], s[24:25], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[74:75], s[22:23], 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s43, 6
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s3, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[78:79], s[20:21], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 21
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s10, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[90:91], s[18:19], 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s43, 7
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s3, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 9
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[12:13], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[92:93], s[16:17], 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s43, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s3, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 6
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[94:95], s[2:3], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 22
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s10, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[30:31], s[0:1], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 23
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s13, 24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s13, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 25
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s13, 8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 26
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s12, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 27
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s12, 8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 28
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s15, 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 29
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s15, 16
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 30
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s15, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 31
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s14, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s43, 9
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s2, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 7
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[14:15], 24
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 4
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 5
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[40:41], 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 2
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 3
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[28:29], 24
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 0
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s14, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s46, 2
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 1
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s41, 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s47, 3
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[46:47], s[6:7], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 2
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s41, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s46, 0
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 3
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s41, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s47, 1
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[46:47], s[8:9], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 4
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s40, 16
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 5
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s40, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 6
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s29, 24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 7
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s29, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s2, 16
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s44
-; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB57_3
-; GFX11-FAKE16-NEXT:  .LBB57_2: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 1
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[22:23], 24
+; GFX11-FAKE16-NEXT:    s_branch .LBB57_3
+; GFX11-FAKE16-NEXT:  .LBB57_2:
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vcc_hi
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vcc_lo
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    s_mov_b32 s101, -1
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_lo, 0
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_hi, 1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vcc_lo
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_lo, 2
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_hi, 3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vcc_lo
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_lo, 4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_hi, 5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vcc_lo
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr45
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr44
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr99
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr43
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr74
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr94
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr92
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr90
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr78
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr62
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr97
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr73
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr72
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr61
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr60
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr58
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr59
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr56
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr47
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr57
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr103
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr102
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr104
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr88
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr76
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_lo, 6
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_hi, 7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vcc_lo
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_lo, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_hi, 9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vcc_lo
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_lo, 10
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_hi, 11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vcc_lo
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_lo, 12
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_hi, 13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vcc_lo
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_lo, 14
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_hi, 15
+; GFX11-FAKE16-NEXT:  .LBB57_3: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s101
+; GFX11-FAKE16-NEXT:    s_mov_b32 s101, s104
+; GFX11-FAKE16-NEXT:    s_mov_b32 s104, s57
+; GFX11-FAKE16-NEXT:    s_mov_b32 s57, s69
+; GFX11-FAKE16-NEXT:    s_mov_b32 s69, s42
+; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB57_5
+; GFX11-FAKE16-NEXT:  ; %bb.4: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    s_add_u32 s0, s0, 3
 ; GFX11-FAKE16-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX11-FAKE16-NEXT:    s_add_u32 s2, s2, 3
@@ -87189,321 +87307,355 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    s_addc_u32 s7, s7, 0
 ; GFX11-FAKE16-NEXT:    s_add_u32 s4, s4, 3
 ; GFX11-FAKE16-NEXT:    s_addc_u32 s5, s5, 0
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s102, s29, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s5, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[46:47], s[4:5], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 4
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s5, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s103, s28, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s104, s28, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 vcc_hi, s27, 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 5
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s5, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s34, s27, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s35, s27, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s36, s26, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 6
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s4, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s37, s26, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s38, s25, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s39, s25, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 7
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s4, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s48, s25, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s49, s24, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s50, s24, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s7, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s51, s23, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s52, s23, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s53, s23, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 9
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s7, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s54, s22, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s55, s22, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s64, s21, 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 10
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s7, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s65, s21, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s66, s21, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s67, s20, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 11
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s6, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s68, s20, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s69, s19, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s70, s19, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 12
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s6, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s71, s19, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s80, s18, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s81, s18, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 13
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s9, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s82, s17, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s83, s17, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s84, s17, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 14
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s9, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s85, s16, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s86, s16, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s87, s3, 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 15
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s9, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s96, s3, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s97, s3, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s2, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s25, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[4:5], 24
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s8, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s98, s1, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s99, s1, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s100, s1, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s24, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s101, s5, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s102, s5, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s103, s5, 8
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 17
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s8, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s0, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s101, s0, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[56:57], s[10:11], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s24, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s104, s4, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s47, s4, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s7, 24
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 18
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s11, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[58:59], s[12:13], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[60:61], s[14:15], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[40:41], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s23, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 vcc_hi, s7, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s34, s7, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s57, s6, 16
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 19
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s11, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[72:73], s[28:29], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[76:77], s[26:27], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[88:89], s[24:25], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s23, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s56, s6, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s35, s9, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s36, s9, 16
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 20
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s11, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[74:75], s[22:23], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[78:79], s[20:21], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[90:91], s[18:19], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s23, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s37, s9, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s38, s8, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s39, s8, 8
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 21
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s10, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[92:93], s[16:17], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[94:95], s[2:3], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[30:31], s[0:1], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s22, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s48, s11, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s49, s11, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s50, s11, 8
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 22
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s10, 8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s22, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s51, s10, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s52, s10, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s53, s13, 24
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 23
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s13, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s21, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s54, s13, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s55, s13, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s64, s12, 16
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s13, 16
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s21, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s65, s12, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s66, s15, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s67, s15, 16
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 25
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s13, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s21, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s68, s15, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s59, s14, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s58, s14, 8
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 26
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s12, 16
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s20, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s70, s41, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s71, s41, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s60, s41, 8
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 27
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s12, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s20, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s80, s40, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s61, s40, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s81, s29, 24
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 28
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s15, 24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s19, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s82, s29, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s83, s29, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s84, s28, 16
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 29
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s15, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s19, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s85, s28, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s86, s27, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s72, s27, 16
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 30
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s15, 8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s19, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s87, s27, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s73, s26, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s96, s26, 8
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s42, 31
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s14, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s18, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s97, s25, 24
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 0
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s14, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s46, 2
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s18, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 14
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s69, s25, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s74, s2, 16
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 1
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s41, 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s47, 3
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[46:47], s[6:7], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s17, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 15
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[6:7], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s2, 8
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 2
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s41, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s46, 0
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s17, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 12
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s98, s1, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s99, s1, 16
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 3
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s41, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s47, 1
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[46:47], s[8:9], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s17, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 13
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[8:9], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s100, s1, 8
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 4
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s40, 16
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s16, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 10
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s44, s0, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s0, 8
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 5
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s40, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s16, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 11
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[10:11], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[76:77], s[26:27], 24
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 6
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s29, 24
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s3, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[88:89], s[24:25], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[78:79], s[20:21], 24
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 7
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s29, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s3, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 9
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[12:13], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[90:91], s[18:19], 24
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s2, 16
-; GFX11-FAKE16-NEXT:  .LBB57_3: ; %end
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s44, s101, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s45, s45, 0xff
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s44
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s44, s30, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s3, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 6
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[92:93], s[16:17], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[94:95], s[2:3], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[30:31], s[0:1], 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v18, s42, 9
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 7
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[14:15], 24
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 4
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 5
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[40:41], 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 2
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 3
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[28:29], 24
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s62, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, s63, 1
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[62:63], s[22:23], 24
+; GFX11-FAKE16-NEXT:  .LBB57_5: ; %end
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s43, s43, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s44, s45, s44
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s44, s44, 16
+; GFX11-FAKE16-NEXT:    s_and_b32 s42, s74, 0xff
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s43
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s43, s94, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s42, s42, 0xff
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT:    s_or_b32 s42, s42, s43
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s45, s45, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s42, s42, 16
+; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s42
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s42, v18, 9
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s45
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s45, s30, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s44, s44, 0xff
+; GFX11-FAKE16-NEXT:    s_and_b32 s3, s3, 0xff
+; GFX11-FAKE16-NEXT:    s_or_b32 s44, s44, s45
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s42, s42, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s44, s44, 16
+; GFX11-FAKE16-NEXT:    s_or_b32 s3, s3, s42
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s42, v18, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s43, v18, 7
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s44
 ; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s44, s100, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s42, s42, s43
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s45, s98, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s44
 ; GFX11-FAKE16-NEXT:    s_and_b32 s44, s99, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s45, s98, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s42, s42, 16
+; GFX11-FAKE16-NEXT:    s_and_b32 s42, s42, 0xff
 ; GFX11-FAKE16-NEXT:    s_or_b32 s44, s44, s45
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s42
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s3, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s42, s97, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s43, s43, 8
 ; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s44, s44, 16
-; GFX11-FAKE16-NEXT:    s_or_b32 s3, s3, s42
-; GFX11-FAKE16-NEXT:    s_and_b32 s42, s96, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s43, s87, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s44
 ; GFX11-FAKE16-NEXT:    s_or_b32 s42, s42, s43
+; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s44
 ; GFX11-FAKE16-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s42, s42, 16
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s0, s86, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s16, 0xff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v18, 6
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s3, s42
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s2, v18, 5
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s16, 0xff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s3, v18, 2
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1, s0
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s92, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s85, 0xff
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s84, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s2, v18, 4
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s82, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_and_b32 s1, s17, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s16, s80, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s16, v18, 0
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s83, 0xff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s2, v18, 3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s17, v19, 29
+; GFX11-FAKE16-NEXT:    s_and_b32 s16, s16, 0xff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s100, v17, 4
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s99, v17, 3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s3, s18, 0xff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s69, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s17, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s81, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s2, v18, 1
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v19, 28
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s20, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s18, v19, 19
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s3, s2
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s90, 8
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s16, s3
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s16, s71, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s16, v19, 31
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s0, s68, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s0, s0, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s3, s19, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s20, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s16, s16, 8
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1, s0
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s3, s16
-; GFX11-FAKE16-NEXT:    s_and_b32 s16, s70, 0xff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s16, v19, 30
 ; GFX11-FAKE16-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s78, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s18, s18, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s16, s16, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s19, s86, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s16, s16, s17
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s17, v19, 21
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s16, s16, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s78, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s98, v17, 2
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s3, s16
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s67, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s2, v19, 27
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s3, v19, 24
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s16, v19, 22
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s17, 8
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s66, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s2, v19, 26
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s64, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s16, s16, 0xff
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_and_b32 s1, s21, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s16, s54, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s86, v16, 30
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s65, 0xff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s2, v19, 25
 ; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v16, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v16, 0
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s3, s22, 0xff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s53, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s55, 8
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s18, s51, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s2, v19, 23
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s1, v19, 18
+; GFX11-FAKE16-NEXT:    s_and_b32 s0, s24, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s74, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s62, 8
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s16, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s16, s23, 0xff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX11-FAKE16-NEXT:    s_or_b32 s16, s16, s17
-; GFX11-FAKE16-NEXT:    s_and_b32 s17, s52, 0xff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s17, v19, 20
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT:    s_or_b32 s17, s17, s18
 ; GFX11-FAKE16-NEXT:    s_and_b32 s3, s16, 0xffff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s17, s17, 0xff
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-FAKE16-NEXT:    s_or_b32 s17, s17, s18
+; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s16, s17, 16
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s97, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s3, s16
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s50, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s49, 0xff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s2, v19, 17
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s88, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-FAKE16-NEXT:    s_and_b32 s16, s69, 0xff
+; GFX11-FAKE16-NEXT:    s_and_b32 s18, s72, 0xff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s97, v17, 1
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s69, v16, 21
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s3
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s3, v19, 16
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s25, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s48, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s16, s39, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s38, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s16, s17
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-FAKE16-NEXT:    s_and_b32 s16, s73, 0xff
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s26, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s37, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s16, s36, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s96, 8
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s76, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s16, s17
 ; GFX11-FAKE16-NEXT:    s_and_b32 s16, s27, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s35, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s18, s34, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s19, vcc_hi, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s87, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX11-FAKE16-NEXT:    s_or_b32 s16, s16, s17
 ; GFX11-FAKE16-NEXT:    s_or_b32 s17, s18, s19
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX11-FAKE16-NEXT:    s_and_b32 s16, s16, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s17, 16
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s16, s17
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s16, v18, 8
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s17, v18, 7
-; GFX11-FAKE16-NEXT:    s_clause 0x1
-; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
-; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s16, v19, 0
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s104, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s103, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s72, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s85, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s84, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s16, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s17, v19, 1
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s29, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s102, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s16, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s17, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s83, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s16, s82, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s81, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s18, v19, 2
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s16, s17
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
@@ -87512,176 +87664,147 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s3
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s3, v18, 6
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s16, v18, 5
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s40, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s62, 8
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s18, v18, 3
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s16, s16, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s61, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s16, s80, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s18, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s19, v19, 3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s16, s17
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s17, v18, 4
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s19, v18, 2
 ; GFX11-FAKE16-NEXT:    s_and_b32 s16, s41, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s18, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s17, 8
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s19, s19, 8
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s60, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s18, s71, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s19, s70, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s16, s16, s17
 ; GFX11-FAKE16-NEXT:    s_or_b32 s17, s18, s19
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX11-FAKE16-NEXT:    s_and_b32 s16, s16, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s17, 16
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s16, s17
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s16, v19, 4
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s1, v18, 1
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s2, v18, 0
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, s14, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s60, 8
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s14, v19, 30
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s58, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s59, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s16, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s15, 0xff
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s3, v19, 31
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s15, v19, 29
-; GFX11-FAKE16-NEXT:    s_and_b32 s14, s14, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s15, s15, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s68, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s14, s67, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s15, s66, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s14, s15
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s14, v19, 6
+; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s12, 0xff
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s3, v19, 28
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s12, v19, 27
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s14, s58, 8
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s15, v19, 24
-; GFX11-FAKE16-NEXT:    s_clause 0x1
-; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off offset:48
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s12, s12, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s65, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s12, s64, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s14, s14, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s15, v19, 7
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s12, s14
 ; GFX11-FAKE16-NEXT:    s_and_b32 s12, s13, 0xff
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s13, v19, 26
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s14, v19, 25
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s15, s15, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s13, s13, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s14, s14, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s13, s55, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s14, s54, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s15, s53, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s12, s12, s13
 ; GFX11-FAKE16-NEXT:    s_or_b32 s13, s14, s15
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX11-FAKE16-NEXT:    s_and_b32 s12, s12, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s13, s13, 16
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s12, s13
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s12, v19, 8
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off offset:48
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s1, v19, 23
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s2, v19, 22
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, s10, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s56, 8
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s10, v19, 20
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s52, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s51, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s12, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s11, 0xff
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s3, v19, 21
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s11, v19, 19
-; GFX11-FAKE16-NEXT:    s_and_b32 s10, s10, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s11, s11, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s50, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s10, s49, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s11, s48, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s10, s11
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s10, v19, 10
+; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s8, 0xff
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s3, v19, 18
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s8, v19, 17
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s46, 8
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s11, v19, 14
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s8, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s39, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s8, s38, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s10, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s11, v19, 11
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s8, s10
 ; GFX11-FAKE16-NEXT:    s_and_b32 s8, s9, 0xff
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s9, v19, 16
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s10, v19, 15
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s11, s11, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s9, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s10, s10, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s37, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s10, s36, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s11, s35, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s8, s8, s9
 ; GFX11-FAKE16-NEXT:    s_or_b32 s9, s10, s11
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX11-FAKE16-NEXT:    s_and_b32 s8, s8, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s9, 16
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s8, s9
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s8, v19, 12
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s1, v19, 13
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s2, v19, 12
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s8, v19, 0
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, s6, 0xff
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s6, v19, 10
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s56, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s57, 0xff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s8, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s7, 0xff
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s3, v19, 11
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s7, v19, 9
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s6, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s34, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s6, vcc_hi, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s46, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s6, s7
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s6, v19, 14
+; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s4, 0xff
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s3, v19, 8
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s4, v19, 7
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s6, v19, 2
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s7, v19, 3
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s7, v19, 4
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s47, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s4, s104, 0xff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s7, v19, 15
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s4, s6
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s5, 0xff
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s5, v19, 6
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s6, v19, 5
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s103, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s6, s102, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s101, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s4, s4, s5
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, s6, s7
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
@@ -87689,7 +87812,9 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:64
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s9, v19, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s17, v19, 5
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s13, v19, 9
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s9, v19, 13
 ; GFX11-FAKE16-NEXT:    s_clause 0x2
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:80
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off offset:96
@@ -87698,13 +87823,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s103, v17, 7
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s102, v17, 6
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s101, v17, 5
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s100, v17, 4
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s99, v17, 3
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s98, v17, 2
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s97, v17, 1
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s96, v17, 0
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s87, v16, 31
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s86, v16, 30
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s85, v16, 29
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s84, v16, 28
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s83, v16, 27
@@ -87713,7 +87833,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s80, v16, 24
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s71, v16, 23
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s70, v16, 22
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s69, v16, 21
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s68, v16, 20
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s67, v16, 19
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s66, v16, 18
@@ -87733,8 +87852,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s36, v16, 4
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s35, v16, 3
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s34, v16, 2
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v16, 1
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v16, 0
 ; GFX11-FAKE16-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-FAKE16-NEXT:    s_clause 0x3
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v16, off, s32
@@ -87744,145 +87861,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT:  .LBB57_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vcc_hi
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vcc_lo
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_lo, 0
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr101
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr45
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr30
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr100
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr99
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr98
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr43
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr94
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr97
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr96
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr87
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr86
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr85
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr92
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr84
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr83
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr82
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr81
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr80
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr90
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr71
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr70
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr69
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr68
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr67
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr78
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr66
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr65
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr64
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr55
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr54
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr74
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr53
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr52
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr51
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr50
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr49
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr48
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr39
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr38
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr37
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr36
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr35
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr34
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr104
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr103
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr102
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr88
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr76
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr72
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr62
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr60
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr58
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr56
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_hi, 1
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vcc_lo
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_lo, 2
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v19, vcc_hi, 3
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
-; GFX11-FAKE16-NEXT:    s_branch .LBB57_2
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -95182,7 +95160,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:332
 ; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:8
@@ -95208,7 +95186,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:168
 ; VI-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:176
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v15
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v17
@@ -95217,11 +95195,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v21
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v56, 8, v3
-; VI-NEXT:    v_lshlrev_b32_e32 v8, 8, v5
-; VI-NEXT:    v_lshlrev_b32_e32 v10, 8, v7
-; VI-NEXT:    v_lshlrev_b32_e32 v14, 8, v9
-; VI-NEXT:    v_lshlrev_b32_e32 v12, 8, v11
+; VI-NEXT:    v_lshlrev_b32_e32 v8, 8, v3
+; VI-NEXT:    v_lshlrev_b32_e32 v59, 8, v5
+; VI-NEXT:    v_lshlrev_b32_e32 v14, 8, v7
+; VI-NEXT:    v_lshlrev_b32_e32 v10, 8, v9
+; VI-NEXT:    v_lshlrev_b32_e32 v16, 8, v11
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, 8, v13
 ; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -95242,7 +95220,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v41
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v40
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v55
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v54
@@ -95250,22 +95228,22 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v53
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v52
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v51
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v50
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v49
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v48
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v48
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v39
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
 ; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v22
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v31
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v32
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v33
@@ -95288,24 +95266,23 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 8, v38
 ; VI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; VI-NEXT:    s_waitcnt vmcnt(7)
-; VI-NEXT:    v_lshlrev_b32_e32 v22, 8, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v26, 8, v0
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_lshlrev_b32_e32 v32, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(5)
-; VI-NEXT:    v_lshlrev_b32_e32 v23, 8, v13
-; VI-NEXT:    s_waitcnt vmcnt(4)
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v54, 8, v13
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    v_lshlrev_b32_e32 v49, 8, v3
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v11
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v5
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v5
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v9
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v7
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:248
@@ -95318,37 +95295,37 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    buffer_load_ushort v7, off, s[0:3], s32 offset:304
 ; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
 ; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_lshlrev_b32_e32 v48, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_lshlrev_b32_e32 v27, 8, v13
 ; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v11
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
 ; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v5
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v28, 8, v3
+; VI-NEXT:    v_lshlrev_b32_e32 v29, 8, v3
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:312
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:320
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:328
 ; VI-NEXT:    buffer_load_ushort v11, off, s[0:3], s32 offset:4
-; VI-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:12
 ; VI-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:20
-; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_ushort v13, off, s[0:3], s32 offset:36
 ; VI-NEXT:    s_waitcnt vmcnt(11)
 ; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v7
-; VI-NEXT:    v_lshlrev_b32_e32 v29, 8, v9
+; VI-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
 ; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_lshlrev_b32_e32 v7, 8, v0
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v18, off, s[0:3], s32 offset:44
-; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:52
-; VI-NEXT:    buffer_load_ushort v15, off, s[0:3], s32 offset:60
+; VI-NEXT:    buffer_load_ushort v15, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:68
 ; VI-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:76
 ; VI-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:84
@@ -95356,23 +95333,23 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:100
 ; VI-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:108
 ; VI-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:116
-; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:124
+; VI-NEXT:    buffer_load_ushort v25, off, s[0:3], s32 offset:124
 ; VI-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:132
-; VI-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:140
+; VI-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:140
 ; VI-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:148
-; VI-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:156
+; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:156
 ; VI-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:164
 ; VI-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:172
-; VI-NEXT:    buffer_load_ushort v25, off, s[0:3], s32 offset:180
-; VI-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:188
+; VI-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:180
+; VI-NEXT:    buffer_load_ushort v23, off, s[0:3], s32 offset:188
 ; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:196
-; VI-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:204
-; VI-NEXT:    buffer_load_ushort v26, off, s[0:3], s32 offset:212
+; VI-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:204
+; VI-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:212
 ; VI-NEXT:    buffer_load_ushort v46, off, s[0:3], s32 offset:220
 ; VI-NEXT:    buffer_load_ushort v61, off, s[0:3], s32 offset:228
 ; VI-NEXT:    buffer_load_ushort v47, off, s[0:3], s32 offset:236
-; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:244
-; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:252
+; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:244
+; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:252
 ; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:260
 ; VI-NEXT:    buffer_load_ushort v63, off, s[0:3], s32 offset:268
 ; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:276
@@ -95385,57 +95362,57 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
 ; VI-NEXT:    s_waitcnt vmcnt(12)
-; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
 ; VI-NEXT:    s_cbranch_scc0 .LBB59_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
-; VI-NEXT:    v_or_b32_sdwa v0, v2, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
 ; VI-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -95450,12 +95427,12 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    s_lshl_b32 s7, s23, 8
 ; VI-NEXT:    s_lshl_b32 s8, s27, 8
 ; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_or_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_or_b32_sdwa v3, v3, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -95484,7 +95461,9 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v29, v9
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
@@ -95507,15 +95486,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v50, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -95525,18 +95504,18 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v59, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v56, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v39, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -95544,7 +95523,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v38, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v37, v0
@@ -95552,8 +95531,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v36, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -95565,41 +95544,39 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v33, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
 ; VI-NEXT:    v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v51, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v25, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v34, v22
+; VI-NEXT:    v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v0, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v43, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v32, v23
+; VI-NEXT:    v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; VI-NEXT:    v_mov_b32_e32 v51, v3
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_mov_b32_e32 v43, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v43, v49
+; VI-NEXT:    v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v32, v54
+; VI-NEXT:    v_mov_b32_e32 v34, v26
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v49, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v54, v0
@@ -95609,26 +95586,28 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v46, v61
 ; VI-NEXT:    v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v58, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v0, v45, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v47, v45
 ; VI-NEXT:    v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT:    v_or_b32_sdwa v0, v63, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v48, v28
-; VI-NEXT:    v_mov_b32_e32 v47, v58
-; VI-NEXT:    v_mov_b32_e32 v45, v44
-; VI-NEXT:    v_mov_b32_e32 v63, v42
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v58, v44
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_mov_b32_e32 v48, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
 ; VI-NEXT:    v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v63, v42
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -95644,8 +95623,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v57, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -95679,43 +95658,44 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
 ; VI-NEXT:    s_branch .LBB59_3
 ; VI-NEXT:  .LBB59_2:
-; VI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v32, v54
+; VI-NEXT:    v_mov_b32_e32 v43, v49
 ; VI-NEXT:    v_mov_b32_e32 v46, v61
-; VI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT:    v_mov_b32_e32 v34, v22
-; VI-NEXT:    v_mov_b32_e32 v32, v23
-; VI-NEXT:    v_mov_b32_e32 v47, v58
-; VI-NEXT:    v_mov_b32_e32 v45, v44
+; VI-NEXT:    v_mov_b32_e32 v47, v45
+; VI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v34, v26
+; VI-NEXT:    v_mov_b32_e32 v58, v44
 ; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_mov_b32_e32 v63, v42
 ; VI-NEXT:    v_mov_b32_e32 v51, v7
-; VI-NEXT:    v_mov_b32_e32 v48, v28
+; VI-NEXT:    v_mov_b32_e32 v48, v29
 ; VI-NEXT:    s_mov_b64 s[4:5], -1
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; VI-NEXT:  .LBB59_3: ; %Flow
 ; VI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; VI-NEXT:    v_mov_b32_e32 v42, v45
-; VI-NEXT:    v_mov_b32_e32 v45, v46
+; VI-NEXT:    v_mov_b32_e32 v44, v47
+; VI-NEXT:    v_mov_b32_e32 v47, v46
 ; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_mov_b32_e32 v46, v49
 ; VI-NEXT:    s_cbranch_vccnz .LBB59_5
 ; VI-NEXT:  ; %bb.4: ; %cmp.true
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
 ; VI-NEXT:    s_add_i32 s28, s28, 3
 ; VI-NEXT:    s_and_b32 s4, s28, 0xff
 ; VI-NEXT:    s_lshl_b32 s5, s29, 8
@@ -95769,7 +95749,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 0x300, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
@@ -95778,8 +95758,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 0x3000000, v0
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 0x3000000, v1
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -95792,8 +95772,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v5, vcc, 0x3000000, v0
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -95868,29 +95848,29 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v11, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v12, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
@@ -95902,8 +95882,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v13, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
@@ -95915,8 +95895,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v14, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
@@ -95928,8 +95908,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v15, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
@@ -95940,8 +95920,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v16, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -95951,8 +95931,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v17, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
@@ -95963,8 +95943,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v18, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -95974,63 +95954,63 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v19, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v20, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v21, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v22, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v23, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -96040,57 +96020,54 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v24, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
-; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v45
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v47
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v25, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v47
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v44
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v26, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v42
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v58
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v27, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v63
+; VI-NEXT:    v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v28, vcc, 0x3000000, v0
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v41
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v40
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
@@ -96375,7 +96352,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:164
 ; GFX9-NEXT:    buffer_load_ushort v23, off, s[0:3], s32 offset:172
 ; GFX9-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:180
-; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:188
+; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:188
 ; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:196
 ; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:204
 ; GFX9-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:212
@@ -96383,11 +96360,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:228
 ; GFX9-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:236
 ; GFX9-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:244
-; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:252
+; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:252
 ; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:260
 ; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:268
-; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:276
-; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:284
+; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:276
+; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:284
 ; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:292
 ; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:300
 ; GFX9-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:308
@@ -96413,7 +96390,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
@@ -96588,7 +96565,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v0, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -96600,7 +96577,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    v_mov_b32_e32 v37, v57
 ; GFX9-NEXT:    v_mov_b32_e32 v57, v60
 ; GFX9-NEXT:    v_mov_b32_e32 v52, v56
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_mov_b32_e32 v34, v0
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -96609,14 +96586,14 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v45, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -96626,12 +96603,12 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    v_mov_b32_e32 v51, v0
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -96685,7 +96662,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:  .LBB59_2:
 ; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
@@ -97047,12 +97024,12 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 3, v45
-; GFX9-NEXT:    v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v0, 0x300, v0
 ; GFX9-NEXT:    v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_add_u32_e32 v0, 3, v44
+; GFX9-NEXT:    v_add_u32_e32 v0, 3, v40
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 3, v55
@@ -97062,7 +97039,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_add_u32_e32 v0, 3, v43
-; GFX9-NEXT:    v_add_u32_e32 v1, 3, v42
+; GFX9-NEXT:    v_add_u32_e32 v1, 3, v36
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v0, 0x300, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -97071,7 +97048,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_add_u32_e32 v0, 3, v36
+; GFX9-NEXT:    v_add_u32_e32 v0, 3, v42
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 3, v41
@@ -103833,10 +103810,10 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_mul_f32_e32 v34, 1.0, v36
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s17
-; SI-NEXT:    v_mul_f32_e64 v33, 1.0, s18
+; SI-NEXT:    v_mul_f32_e64 v35, 1.0, s18
 ; SI-NEXT:    v_mul_f32_e64 v36, 1.0, s21
 ; SI-NEXT:    v_mul_f32_e64 v42, 1.0, s20
-; SI-NEXT:    v_mul_f32_e64 v35, 1.0, s22
+; SI-NEXT:    v_mul_f32_e64 v33, 1.0, s22
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
@@ -103865,9 +103842,9 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB63_4
@@ -103878,13 +103855,12 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; SI-NEXT:    v_alignbit_b32 v5, v5, v8, 16
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_alignbit_b32 v3, v3, v35, 16
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_alignbit_b32 v1, v1, v35, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_alignbit_b32 v4, v4, v9, 16
 ; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(5)
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_mov_b32_e32 v59, v2
 ; SI-NEXT:    v_alignbit_b32 v0, v0, v2, 16
@@ -103894,11 +103870,10 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_mov_b32_e32 v47, v10
 ; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
 ; SI-NEXT:    v_mov_b32_e32 v45, v12
-; SI-NEXT:    v_mov_b32_e32 v44, v14
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_alignbit_b32 v3, v3, v33, 16
+; SI-NEXT:    v_mov_b32_e32 v33, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_alignbit_b32 v1, v1, v33, 16
 ; SI-NEXT:    v_mov_b32_e32 v62, v38
 ; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
 ; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
@@ -103938,20 +103913,20 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_alignbit_b32 v28, v28, v37, 16
 ; SI-NEXT:    v_mov_b32_e32 v37, v34
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(3) expcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_mov_b32_e32 v35, v7
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_mov_b32_e32 v43, v8
 ; SI-NEXT:    v_alignbit_b32 v7, v7, v8, 16
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_mov_b32_e32 v60, v9
+; SI-NEXT:    s_waitcnt vmcnt(2) expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v42, v9
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v32
 ; SI-NEXT:    v_alignbit_b32 v31, v31, v34, 16
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v42, v8
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v60, v8
 ; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
 ; SI-NEXT:    v_alignbit_b32 v8, v8, v9, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v11
@@ -103975,7 +103950,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_alignbit_b32 v12, v12, v14, 16
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v33, v14
+; SI-NEXT:    v_mov_b32_e32 v44, v14
 ; SI-NEXT:    v_alignbit_b32 v13, v13, v14, 16
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -103998,7 +103973,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:  .LBB63_2: ; %cmp.true
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v59
 ; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
@@ -104014,7 +103989,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v35
 ; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
 ; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v42
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v60
 ; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
 ; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v57
@@ -104026,7 +104001,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v45
 ; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
 ; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v44
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v33
 ; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
 ; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v36
@@ -104138,7 +104113,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
@@ -104161,7 +104136,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v43
 ; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
 ; SI-NEXT:    v_alignbit_b32 v7, v8, v7, 16
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v60
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v42
 ; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
 ; SI-NEXT:    v_alignbit_b32 v8, v9, v8, 16
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v58
@@ -104176,7 +104151,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v63
 ; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
 ; SI-NEXT:    v_alignbit_b32 v12, v13, v12, 16
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v33
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v44
 ; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
 ; SI-NEXT:    v_alignbit_b32 v13, v14, v13, 16
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v62
@@ -104262,16 +104237,16 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB63_4:
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    s_waitcnt expcnt(3)
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; SI-NEXT:    v_mov_b32_e32 v61, v53
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
@@ -104280,7 +104255,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_mov_b32_e32 v57, v11
 ; SI-NEXT:    v_mov_b32_e32 v47, v10
 ; SI-NEXT:    v_mov_b32_e32 v45, v12
-; SI-NEXT:    v_mov_b32_e32 v44, v14
+; SI-NEXT:    v_mov_b32_e32 v33, v14
 ; SI-NEXT:    v_mov_b32_e32 v62, v38
 ; SI-NEXT:    v_mov_b32_e32 v38, v39
 ; SI-NEXT:    v_mov_b32_e32 v39, v41
@@ -107346,98 +107321,36 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $vgpr60
 ; SI-NEXT:    ; implicit-def: $vgpr61
 ; SI-NEXT:    ; implicit-def: $vgpr58
 ; SI-NEXT:    ; implicit-def: $vgpr59
 ; SI-NEXT:    ; implicit-def: $vgpr56
 ; SI-NEXT:    ; implicit-def: $vgpr57
-; SI-NEXT:    ; implicit-def: $vgpr46
+; SI-NEXT:    ; implicit-def: $vgpr42
 ; SI-NEXT:    ; implicit-def: $vgpr47
-; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    ; implicit-def: $vgpr44
 ; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr42
 ; SI-NEXT:    ; implicit-def: $vgpr43
-; SI-NEXT:    ; implicit-def: $vgpr53
 ; SI-NEXT:    ; implicit-def: $vgpr41
-; SI-NEXT:    ; implicit-def: $vgpr38
-; SI-NEXT:    ; implicit-def: $vgpr50
+; SI-NEXT:    ; implicit-def: $vgpr55
+; SI-NEXT:    ; implicit-def: $vgpr53
+; SI-NEXT:    ; implicit-def: $vgpr51
 ; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; implicit-def: $vgpr37
 ; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; kill: killed $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    ; implicit-def: $vgpr44
-; SI-NEXT:    ; implicit-def: $vgpr54
 ; SI-NEXT:    ; implicit-def: $vgpr52
-; SI-NEXT:    ; implicit-def: $vgpr40
+; SI-NEXT:    ; implicit-def: $vgpr46
+; SI-NEXT:    ; implicit-def: $vgpr54
+; SI-NEXT:    ; implicit-def: $vgpr50
+; SI-NEXT:    ; implicit-def: $vgpr38
 ; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; implicit-def: $vgpr40
 ; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $vgpr34
+; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; SI-NEXT:    ; implicit-def: $vgpr31
@@ -107445,104 +107358,175 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; kill: killed $vgpr31
 ; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB64_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v62
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v32, v31
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v63
-; SI-NEXT:    v_cvt_f32_f16_e32 v36, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v34, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v30
 ; SI-NEXT:    v_cvt_f32_f16_e32 v40, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v54, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v38, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v55, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v54, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v27
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v38, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v7
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v52, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v26
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, v32
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v36, v63
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v25
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v4
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
+; SI-NEXT:    v_cvt_f32_f16_e32 v48, v30
+; SI-NEXT:    v_cvt_f32_f16_e32 v50, v29
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v24
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
+; SI-NEXT:    v_cvt_f32_f16_e32 v46, v28
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v23
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
 ; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v22
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v47, v32
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v21
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v57, v32
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v44, v5
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v20
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v59, v32
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v42, v4
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v61, v32
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v56, v3
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v18
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v62
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v63
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v58, v2
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v17
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT:    v_cvt_f32_f16_e32 v52, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v44, v29
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v60, v1
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr17
+; SI-NEXT:    ; implicit-def: $vgpr18
+; SI-NEXT:    ; implicit-def: $vgpr19
+; SI-NEXT:    ; implicit-def: $vgpr20
+; SI-NEXT:    ; implicit-def: $vgpr21
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr23
+; SI-NEXT:    ; implicit-def: $vgpr24
+; SI-NEXT:    ; implicit-def: $vgpr25
+; SI-NEXT:    ; implicit-def: $vgpr26
+; SI-NEXT:    ; implicit-def: $vgpr27
+; SI-NEXT:    ; implicit-def: $vgpr28
+; SI-NEXT:    ; implicit-def: $vgpr29
+; SI-NEXT:    ; implicit-def: $vgpr30
+; SI-NEXT:    ; implicit-def: $vgpr63
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v15
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, v31
@@ -107555,8 +107539,18 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v11
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v50, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v51, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v53, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v55, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v41, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v43, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v45, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
 ; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
@@ -107564,33 +107558,28 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f32_f16_e32 v53, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v51, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v46, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v56, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v58, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v60, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v47, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v57, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v59, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v61, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, v62
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr3
@@ -107607,22 +107596,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr15
 ; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr30
 ; SI-NEXT:    ; implicit-def: $vgpr62
-; SI-NEXT:    ; implicit-def: $vgpr63
 ; SI-NEXT:  .LBB64_2: ; %Flow
 ; SI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB64_4
@@ -107632,9 +107606,9 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 3, v3
 ; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v5
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 3, v7
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
 ; SI-NEXT:    v_add_i32_e32 v9, vcc, 3, v9
 ; SI-NEXT:    v_addc_u32_e32 v10, vcc, 0, v10, vcc
@@ -107645,111 +107619,114 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 3, v15
 ; SI-NEXT:    v_addc_u32_e32 v16, vcc, 0, v16, vcc
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, 3, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
 ; SI-NEXT:    v_addc_u32_e32 v18, vcc, 0, v18, vcc
+; SI-NEXT:    v_add_i32_e32 v19, vcc, 3, v19
+; SI-NEXT:    v_addc_u32_e32 v20, vcc, 0, v20, vcc
+; SI-NEXT:    v_add_i32_e32 v21, vcc, 3, v21
+; SI-NEXT:    v_addc_u32_e32 v22, vcc, 0, v22, vcc
+; SI-NEXT:    v_add_i32_e32 v23, vcc, 3, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v9
+; SI-NEXT:    v_addc_u32_e32 v24, vcc, 0, v24, vcc
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT:    v_add_i32_e32 v25, vcc, 3, v25
+; SI-NEXT:    v_addc_u32_e32 v26, vcc, 0, v26, vcc
+; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v26
+; SI-NEXT:    v_add_i32_e32 v27, vcc, 3, v27
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v50
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; SI-NEXT:    v_addc_u32_e32 v28, vcc, 0, v28, vcc
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 3, v19
+; SI-NEXT:    v_add_i32_e32 v29, vcc, 3, v29
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v25
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
-; SI-NEXT:    v_addc_u32_e32 v20, vcc, 0, v20, vcc
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v29
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
-; SI-NEXT:    v_add_i32_e32 v21, vcc, 3, v21
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v48
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v22
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v20
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v38
+; SI-NEXT:    v_mov_b32_e32 v38, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v46
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v20
 ; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v18
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v34
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v18
 ; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v17
 ; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v16
-; SI-NEXT:    v_addc_u32_e32 v22, vcc, 0, v22, vcc
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v32
+; SI-NEXT:    v_addc_u32_e32 v30, vcc, 0, v30, vcc
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_add_i32_e32 v42, vcc, 3, v63
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v14
+; SI-NEXT:    v_addc_u32_e32 v44, vcc, 0, v62, vcc
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v61
-; SI-NEXT:    v_add_i32_e32 v23, vcc, 3, v23
-; SI-NEXT:    v_addc_u32_e32 v24, vcc, 0, v24, vcc
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v42
+; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v44
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, v44
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v59
-; SI-NEXT:    v_add_i32_e32 v25, vcc, 3, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v9
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v36, v42
+; SI-NEXT:    v_cvt_f32_f16_e32 v44, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v57
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v10
-; SI-NEXT:    v_addc_u32_e32 v26, vcc, 0, v26, vcc
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v47
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_add_i32_e32 v27, vcc, 3, v27
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT:    v_addc_u32_e32 v28, vcc, 0, v28, vcc
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v27
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v28
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v52
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v50
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v41
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v55
-; SI-NEXT:    v_mov_b32_e32 v55, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v38
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v51, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v46, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v42, v4
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v56, v3
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v47
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v35
 ; SI-NEXT:    v_cvt_f32_f16_e32 v58, v35
 ; SI-NEXT:    v_cvt_f32_f16_e32 v35, v43
-; SI-NEXT:    v_add_i32_e32 v29, vcc, 3, v29
-; SI-NEXT:    v_addc_u32_e32 v30, vcc, 0, v30, vcc
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_add_i32_e32 v53, vcc, 3, v63
-; SI-NEXT:    v_addc_u32_e32 v42, vcc, 0, v62, vcc
-; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v53
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v42
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v53
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f32_f16_e32 v53, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v63
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v62
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v10
 ; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v11
 ; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v12
 ; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v13
 ; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v23
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v29
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v27
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v28
 ; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v42
 ; SI-NEXT:    v_cvt_f32_f16_e32 v30, v30
 ; SI-NEXT:    v_cvt_f32_f16_e32 v29, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v33
 ; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v33
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v63
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v62
 ; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
 ; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
 ; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
@@ -107767,59 +107744,51 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
 ; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v60, v33
 ; SI-NEXT:    v_cvt_f32_f16_e32 v40, v40
 ; SI-NEXT:    v_cvt_f32_f16_e32 v54, v54
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v36
-; SI-NEXT:    v_mov_b32_e32 v36, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v34
-; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_cvt_f32_f16_e32 v38, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v34, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v44
+; SI-NEXT:    v_cvt_f32_f16_e32 v52, v52
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, v45
 ; SI-NEXT:    v_cvt_f32_f16_e32 v37, v37
 ; SI-NEXT:    v_cvt_f32_f16_e32 v39, v39
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, v49
-; SI-NEXT:    v_cvt_f32_f16_e32 v50, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v51, v51
+; SI-NEXT:    v_cvt_f32_f16_e32 v53, v53
+; SI-NEXT:    v_cvt_f32_f16_e32 v55, v55
+; SI-NEXT:    v_cvt_f32_f16_e32 v41, v41
 ; SI-NEXT:    v_cvt_f32_f16_e32 v47, v5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v57, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v59, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v61, v6
-; SI-NEXT:    v_mov_b32_e32 v52, v30
-; SI-NEXT:    v_mov_b32_e32 v44, v29
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v50, v29
+; SI-NEXT:    v_mov_b32_e32 v48, v30
+; SI-NEXT:    v_mov_b32_e32 v46, v28
+; SI-NEXT:    v_mov_b32_e32 v34, v8
+; SI-NEXT:    v_mov_b32_e32 v32, v7
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v43, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v45, v1
 ; SI-NEXT:  .LBB64_4: ; %end
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
@@ -107845,59 +107814,63 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v47
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v46
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v42
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v45
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v51
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v44
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v43
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v42
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v41
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v53
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 24, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v38
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v55
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v31
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v53
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v50
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v51
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 36, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v49
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 40, v0
@@ -107906,7 +107879,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v39
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 44, v0
@@ -107915,7 +107888,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v37
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
@@ -107924,7 +107897,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v35
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
@@ -107933,7 +107906,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v33
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 56, v0
@@ -107943,20 +107916,9 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v1, v2, v1
-; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 64, v0
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -107967,7 +107929,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x44, v0
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 64, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -107978,7 +107940,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x48, v0
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x44, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -107989,7 +107951,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x4c, v0
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x48, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -108000,7 +107962,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x50, v0
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x4c, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -108011,7 +107973,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x54, v0
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x50, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -108020,9 +107982,9 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x58, v0
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x54, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -108032,8 +107994,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x5c, v0
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x58, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -108044,7 +108006,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x60, v0
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x5c, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -108055,7 +108017,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x64, v0
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x60, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -108066,7 +108028,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x68, v0
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x64, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -108076,8 +108038,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v55
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x6c, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v52
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x68, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -108085,28 +108047,35 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v54
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v44
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v46
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x6c, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v1, v2, v1
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v38
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v50
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x70, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v40
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v52
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v48
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x74, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v36
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v48
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v34
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v36
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x78, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v34
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v32
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v32
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v31
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -113358,24 +113327,24 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:36
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:48
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:48
 ; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:52
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:64
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:68
-; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v55, 16, v3
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:68
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v55, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v43, 16, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v15
 ; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v17
 ; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v19
 ; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v21
@@ -113396,23 +113365,23 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_lshlrev_b32_e32 v54, 16, v12
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v38
 ; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v36
 ; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v34
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v32
-; SI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v32
+; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
@@ -113425,46 +113394,45 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB71_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    s_waitcnt expcnt(3)
+; SI-NEXT:    s_waitcnt expcnt(2)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v7, v0, v58
+; SI-NEXT:    v_or_b32_e32 v7, v0, v61
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v4
-; SI-NEXT:    v_or_b32_e32 v9, v0, v61
+; SI-NEXT:    v_or_b32_e32 v9, v0, v50
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v6
-; SI-NEXT:    v_or_b32_e32 v10, v0, v50
+; SI-NEXT:    v_or_b32_e32 v10, v0, v43
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v57
-; SI-NEXT:    v_or_b32_e32 v11, v0, v45
+; SI-NEXT:    v_or_b32_e32 v11, v0, v41
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v56
-; SI-NEXT:    v_or_b32_e32 v12, v0, v41
+; SI-NEXT:    v_or_b32_e32 v12, v0, v40
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v49
-; SI-NEXT:    v_mov_b32_e32 v36, v41
-; SI-NEXT:    v_mov_b32_e32 v41, v13
+; SI-NEXT:    v_mov_b32_e32 v52, v57
+; SI-NEXT:    v_mov_b32_e32 v57, v40
+; SI-NEXT:    v_mov_b32_e32 v40, v49
+; SI-NEXT:    v_mov_b32_e32 v49, v13
 ; SI-NEXT:    v_or_b32_e32 v13, v0, v13
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v14
-; SI-NEXT:    v_mov_b32_e32 v51, v50
-; SI-NEXT:    v_mov_b32_e32 v50, v45
-; SI-NEXT:    v_mov_b32_e32 v45, v14
-; SI-NEXT:    v_or_b32_e32 v14, v0, v40
+; SI-NEXT:    v_mov_b32_e32 v36, v41
+; SI-NEXT:    v_mov_b32_e32 v41, v14
+; SI-NEXT:    v_or_b32_e32 v14, v0, v48
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v16
-; SI-NEXT:    v_mov_b32_e32 v52, v57
-; SI-NEXT:    v_mov_b32_e32 v57, v49
-; SI-NEXT:    v_mov_b32_e32 v49, v40
-; SI-NEXT:    v_mov_b32_e32 v40, v15
+; SI-NEXT:    v_mov_b32_e32 v51, v50
+; SI-NEXT:    v_mov_b32_e32 v50, v43
+; SI-NEXT:    v_mov_b32_e32 v43, v48
+; SI-NEXT:    v_mov_b32_e32 v48, v15
 ; SI-NEXT:    v_or_b32_e32 v15, v0, v15
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v18
-; SI-NEXT:    v_mov_b32_e32 v34, v58
-; SI-NEXT:    v_mov_b32_e32 v58, v61
+; SI-NEXT:    v_mov_b32_e32 v38, v61
 ; SI-NEXT:    v_mov_b32_e32 v61, v56
 ; SI-NEXT:    v_mov_b32_e32 v56, v16
 ; SI-NEXT:    v_or_b32_e32 v16, v0, v37
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v20
-; SI-NEXT:    s_waitcnt expcnt(2)
 ; SI-NEXT:    v_or_b32_e32 v17, v0, v17
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v22
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -113498,7 +113466,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    s_and_b32 s6, s20, 0xffff
 ; SI-NEXT:    s_lshl_b32 s7, s21, 16
 ; SI-NEXT:    v_or_b32_e32 v26, v0, v26
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v59
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v45
 ; SI-NEXT:    s_or_b32 s6, s6, s7
 ; SI-NEXT:    s_and_b32 s7, s22, 0xffff
 ; SI-NEXT:    s_lshl_b32 s8, s23, 16
@@ -113509,7 +113477,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s9, s25, 16
 ; SI-NEXT:    v_mov_b32_e32 v33, v28
 ; SI-NEXT:    v_or_b32_e32 v28, v0, v5
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v43
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v59
 ; SI-NEXT:    s_or_b32 s8, s8, s9
 ; SI-NEXT:    s_and_b32 s9, s26, 0xffff
 ; SI-NEXT:    s_lshl_b32 s10, s27, 16
@@ -113521,7 +113489,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s11, s29, 16
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; SI-NEXT:    v_or_b32_e32 v30, v0, v3
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v38
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v58
 ; SI-NEXT:    s_or_b32 s10, s10, s11
 ; SI-NEXT:    v_mov_b32_e32 v63, v2
 ; SI-NEXT:    v_mov_b32_e32 v32, v55
@@ -113529,9 +113497,9 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v55, v4
 ; SI-NEXT:    v_mov_b32_e32 v53, v6
 ; SI-NEXT:    v_mov_b32_e32 v47, v46
-; SI-NEXT:    v_mov_b32_e32 v59, v44
-; SI-NEXT:    v_mov_b32_e32 v43, v42
-; SI-NEXT:    v_or_b32_e32 v31, v0, v48
+; SI-NEXT:    v_mov_b32_e32 v45, v44
+; SI-NEXT:    v_mov_b32_e32 v59, v42
+; SI-NEXT:    v_or_b32_e32 v31, v0, v34
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    v_mov_b32_e32 v2, s6
@@ -113541,13 +113509,12 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v6, s10
 ; SI-NEXT:    s_cbranch_execnz .LBB71_3
 ; SI-NEXT:  .LBB71_2: ; %cmp.true
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v63
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v32, v1
 ; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x30000, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v38, v43
 ; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s17, 16
@@ -113591,42 +113558,42 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v34, v0
+; SI-NEXT:    v_or_b32_e32 v0, v38, v0
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v55
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v58, v0
+; SI-NEXT:    v_or_b32_e32 v0, v51, v0
 ; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v53
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v51, v0
+; SI-NEXT:    v_or_b32_e32 v0, v50, v0
 ; SI-NEXT:    v_add_i32_e32 v10, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v52
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v50, v0
+; SI-NEXT:    v_or_b32_e32 v0, v36, v0
 ; SI-NEXT:    v_add_i32_e32 v11, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v61
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v36, v0
+; SI-NEXT:    v_or_b32_e32 v0, v57, v0
 ; SI-NEXT:    v_add_i32_e32 v12, vcc, 0x30000, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v57
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v40
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v41, v0
+; SI-NEXT:    v_or_b32_e32 v0, v49, v0
 ; SI-NEXT:    v_add_i32_e32 v13, vcc, 0x30000, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v45
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v41
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v49, v0
+; SI-NEXT:    v_or_b32_e32 v0, v43, v0
 ; SI-NEXT:    v_add_i32_e32 v14, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v56
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v40, v0
+; SI-NEXT:    v_or_b32_e32 v0, v48, v0
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 0x30000, v0
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v37
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -113688,7 +113655,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v25, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v47
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -113704,12 +113671,12 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v27, vcc, 0x30000, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v59
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v45
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    v_add_i32_e32 v28, vcc, 0x30000, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
@@ -113718,7 +113685,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v29, vcc, 0x30000, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v38
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v59
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -113753,26 +113720,26 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB71_4:
-; SI-NEXT:    v_mov_b32_e32 v34, v58
+; SI-NEXT:    v_mov_b32_e32 v38, v61
 ; SI-NEXT:    v_mov_b32_e32 v32, v55
-; SI-NEXT:    v_mov_b32_e32 v58, v61
 ; SI-NEXT:    v_mov_b32_e32 v63, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v4
 ; SI-NEXT:    v_mov_b32_e32 v53, v6
 ; SI-NEXT:    v_mov_b32_e32 v52, v57
 ; SI-NEXT:    v_mov_b32_e32 v51, v50
 ; SI-NEXT:    v_mov_b32_e32 v61, v56
-; SI-NEXT:    v_mov_b32_e32 v50, v45
+; SI-NEXT:    v_mov_b32_e32 v50, v43
 ; SI-NEXT:    v_mov_b32_e32 v36, v41
-; SI-NEXT:    v_mov_b32_e32 v41, v13
-; SI-NEXT:    v_mov_b32_e32 v57, v49
-; SI-NEXT:    v_mov_b32_e32 v49, v40
-; SI-NEXT:    v_mov_b32_e32 v40, v15
-; SI-NEXT:    v_mov_b32_e32 v45, v14
+; SI-NEXT:    v_mov_b32_e32 v57, v40
+; SI-NEXT:    v_mov_b32_e32 v40, v49
+; SI-NEXT:    v_mov_b32_e32 v49, v13
+; SI-NEXT:    v_mov_b32_e32 v43, v48
+; SI-NEXT:    v_mov_b32_e32 v48, v15
+; SI-NEXT:    v_mov_b32_e32 v41, v14
 ; SI-NEXT:    v_mov_b32_e32 v56, v16
 ; SI-NEXT:    v_mov_b32_e32 v47, v46
-; SI-NEXT:    v_mov_b32_e32 v59, v44
-; SI-NEXT:    v_mov_b32_e32 v43, v42
+; SI-NEXT:    v_mov_b32_e32 v45, v44
+; SI-NEXT:    v_mov_b32_e32 v59, v42
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    v_mov_b32_e32 v37, v20
 ; SI-NEXT:    v_mov_b32_e32 v39, v23
@@ -120356,7 +120323,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v29
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v11
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v29
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v1
@@ -120374,7 +120341,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; VI-NEXT:    v_lshrrev_b32_e32 v39, 24, v30
 ; VI-NEXT:    v_lshrrev_b32_e32 v49, 16, v30
 ; VI-NEXT:    v_lshrrev_b32_e32 v61, 8, v30
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v36, 24, v32
 ; VI-NEXT:    v_lshrrev_b32_e32 v55, 16, v32
 ; VI-NEXT:    v_lshrrev_b32_e32 v50, 8, v32
@@ -120542,9 +120509,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v33, s71
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v33, s69
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v33, s68
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v33, s68
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v33, s67
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v33, s66
@@ -120866,12 +120833,12 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; VI-NEXT:    buffer_store_dword v31, v32, s[0:3], 0 offen
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 8, v31
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v31, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v32, 8, v32
 ; VI-NEXT:    v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -131876,7 +131843,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:332
 ; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:8
@@ -131902,7 +131869,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:168
 ; VI-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:176
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v15
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v17
@@ -131911,11 +131878,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v21
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v56, 8, v3
-; VI-NEXT:    v_lshlrev_b32_e32 v8, 8, v5
-; VI-NEXT:    v_lshlrev_b32_e32 v10, 8, v7
-; VI-NEXT:    v_lshlrev_b32_e32 v14, 8, v9
-; VI-NEXT:    v_lshlrev_b32_e32 v12, 8, v11
+; VI-NEXT:    v_lshlrev_b32_e32 v8, 8, v3
+; VI-NEXT:    v_lshlrev_b32_e32 v59, 8, v5
+; VI-NEXT:    v_lshlrev_b32_e32 v14, 8, v7
+; VI-NEXT:    v_lshlrev_b32_e32 v10, 8, v9
+; VI-NEXT:    v_lshlrev_b32_e32 v16, 8, v11
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, 8, v13
 ; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -131936,7 +131903,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v41
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v40
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v55
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v54
@@ -131944,22 +131911,22 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v53
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v52
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v51
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v50
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v49
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v48
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v48
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v39
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
 ; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v22
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v31
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v32
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v33
@@ -131982,24 +131949,23 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 8, v38
 ; VI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; VI-NEXT:    s_waitcnt vmcnt(7)
-; VI-NEXT:    v_lshlrev_b32_e32 v22, 8, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v26, 8, v0
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_lshlrev_b32_e32 v32, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(5)
-; VI-NEXT:    v_lshlrev_b32_e32 v23, 8, v13
-; VI-NEXT:    s_waitcnt vmcnt(4)
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v54, 8, v13
 ; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    v_lshlrev_b32_e32 v49, 8, v3
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v11
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v5
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v5
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v9
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v7
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:248
@@ -132012,37 +131978,37 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    buffer_load_ushort v7, off, s[0:3], s32 offset:304
 ; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
 ; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_lshlrev_b32_e32 v48, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_lshlrev_b32_e32 v27, 8, v13
 ; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v11
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
 ; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v5
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v28, 8, v3
+; VI-NEXT:    v_lshlrev_b32_e32 v29, 8, v3
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:312
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:320
 ; VI-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:328
 ; VI-NEXT:    buffer_load_ushort v11, off, s[0:3], s32 offset:4
-; VI-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_ushort v12, off, s[0:3], s32 offset:12
 ; VI-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:20
-; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_ushort v13, off, s[0:3], s32 offset:36
 ; VI-NEXT:    s_waitcnt vmcnt(11)
 ; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v7
-; VI-NEXT:    v_lshlrev_b32_e32 v29, 8, v9
+; VI-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
 ; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_lshlrev_b32_e32 v7, 8, v0
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v18, off, s[0:3], s32 offset:44
-; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:52
-; VI-NEXT:    buffer_load_ushort v15, off, s[0:3], s32 offset:60
+; VI-NEXT:    buffer_load_ushort v15, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:60
 ; VI-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:68
 ; VI-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:76
 ; VI-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:84
@@ -132050,23 +132016,23 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:100
 ; VI-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:108
 ; VI-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:116
-; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:124
+; VI-NEXT:    buffer_load_ushort v25, off, s[0:3], s32 offset:124
 ; VI-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:132
-; VI-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:140
+; VI-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:140
 ; VI-NEXT:    buffer_load_ushort v20, off, s[0:3], s32 offset:148
-; VI-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:156
+; VI-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:156
 ; VI-NEXT:    buffer_load_ushort v21, off, s[0:3], s32 offset:164
 ; VI-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:172
-; VI-NEXT:    buffer_load_ushort v25, off, s[0:3], s32 offset:180
-; VI-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:188
+; VI-NEXT:    buffer_load_ushort v22, off, s[0:3], s32 offset:180
+; VI-NEXT:    buffer_load_ushort v23, off, s[0:3], s32 offset:188
 ; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:196
-; VI-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:204
-; VI-NEXT:    buffer_load_ushort v26, off, s[0:3], s32 offset:212
+; VI-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:204
+; VI-NEXT:    buffer_load_ushort v24, off, s[0:3], s32 offset:212
 ; VI-NEXT:    buffer_load_ushort v46, off, s[0:3], s32 offset:220
 ; VI-NEXT:    buffer_load_ushort v61, off, s[0:3], s32 offset:228
 ; VI-NEXT:    buffer_load_ushort v47, off, s[0:3], s32 offset:236
-; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:244
-; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:252
+; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:244
+; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:252
 ; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:260
 ; VI-NEXT:    buffer_load_ushort v63, off, s[0:3], s32 offset:268
 ; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:276
@@ -132079,57 +132045,57 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
 ; VI-NEXT:    s_waitcnt vmcnt(12)
-; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
 ; VI-NEXT:    s_cbranch_scc0 .LBB75_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
-; VI-NEXT:    v_or_b32_sdwa v0, v2, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
 ; VI-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -132144,12 +132110,12 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    s_lshl_b32 s7, s23, 8
 ; VI-NEXT:    s_lshl_b32 s8, s27, 8
 ; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_or_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_or_b32_sdwa v3, v3, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -132178,7 +132144,9 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v29, v9
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
@@ -132201,15 +132169,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v50, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -132219,18 +132187,18 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v59, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v56, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v39, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -132238,7 +132206,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v38, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v37, v0
@@ -132246,8 +132214,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v36, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -132259,41 +132227,39 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v33, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
 ; VI-NEXT:    v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v51, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v25, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v34, v22
+; VI-NEXT:    v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v0, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v43, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v32, v23
+; VI-NEXT:    v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; VI-NEXT:    v_mov_b32_e32 v51, v3
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_mov_b32_e32 v43, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v43, v49
+; VI-NEXT:    v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v32, v54
+; VI-NEXT:    v_mov_b32_e32 v34, v26
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v49, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v54, v0
@@ -132303,26 +132269,28 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_mov_b32_e32 v46, v61
 ; VI-NEXT:    v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v58, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v0, v45, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v47, v45
 ; VI-NEXT:    v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT:    v_or_b32_sdwa v0, v63, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v48, v28
-; VI-NEXT:    v_mov_b32_e32 v47, v58
-; VI-NEXT:    v_mov_b32_e32 v45, v44
-; VI-NEXT:    v_mov_b32_e32 v63, v42
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v58, v44
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_mov_b32_e32 v48, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
 ; VI-NEXT:    v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v63, v42
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -132338,8 +132306,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v57, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -132373,43 +132341,44 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
 ; VI-NEXT:    s_branch .LBB75_3
 ; VI-NEXT:  .LBB75_2:
-; VI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v32, v54
+; VI-NEXT:    v_mov_b32_e32 v43, v49
 ; VI-NEXT:    v_mov_b32_e32 v46, v61
-; VI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT:    v_mov_b32_e32 v34, v22
-; VI-NEXT:    v_mov_b32_e32 v32, v23
-; VI-NEXT:    v_mov_b32_e32 v47, v58
-; VI-NEXT:    v_mov_b32_e32 v45, v44
+; VI-NEXT:    v_mov_b32_e32 v47, v45
+; VI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v34, v26
+; VI-NEXT:    v_mov_b32_e32 v58, v44
 ; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_mov_b32_e32 v63, v42
 ; VI-NEXT:    v_mov_b32_e32 v51, v7
-; VI-NEXT:    v_mov_b32_e32 v48, v28
+; VI-NEXT:    v_mov_b32_e32 v48, v29
 ; VI-NEXT:    s_mov_b64 s[4:5], -1
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; VI-NEXT:  .LBB75_3: ; %Flow
 ; VI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; VI-NEXT:    v_mov_b32_e32 v42, v45
-; VI-NEXT:    v_mov_b32_e32 v45, v46
+; VI-NEXT:    v_mov_b32_e32 v44, v47
+; VI-NEXT:    v_mov_b32_e32 v47, v46
 ; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_mov_b32_e32 v46, v49
 ; VI-NEXT:    s_cbranch_vccnz .LBB75_5
 ; VI-NEXT:  ; %bb.4: ; %cmp.true
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
 ; VI-NEXT:    s_add_i32 s28, s28, 3
 ; VI-NEXT:    s_and_b32 s4, s28, 0xff
 ; VI-NEXT:    s_lshl_b32 s5, s29, 8
@@ -132463,7 +132432,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 0x300, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
@@ -132472,8 +132441,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 0x3000000, v0
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 0x3000000, v1
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -132486,8 +132455,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v5, vcc, 0x3000000, v0
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -132562,29 +132531,29 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v11, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v12, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
@@ -132596,8 +132565,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v13, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
@@ -132609,8 +132578,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v14, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
@@ -132622,8 +132591,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v15, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
@@ -132634,8 +132603,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v16, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -132645,8 +132614,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v17, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
@@ -132657,8 +132626,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v18, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -132668,63 +132637,63 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v19, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v20, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v21, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v22, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v23, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -132734,57 +132703,54 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v24, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
-; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v45
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v47
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v25, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v47
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v44
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v26, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v42
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v58
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v27, vcc, 0x3000000, v0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v63
+; VI-NEXT:    v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v28, vcc, 0x3000000, v0
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v41
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v40
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x300, v0
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
@@ -133069,7 +133035,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:164
 ; GFX9-NEXT:    buffer_load_ushort v23, off, s[0:3], s32 offset:172
 ; GFX9-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:180
-; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:188
+; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:188
 ; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:196
 ; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:204
 ; GFX9-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:212
@@ -133077,11 +133043,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:228
 ; GFX9-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:236
 ; GFX9-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:244
-; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:252
+; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:252
 ; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:260
 ; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:268
-; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:276
-; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:284
+; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:276
+; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:284
 ; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:292
 ; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:300
 ; GFX9-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:308
@@ -133107,7 +133073,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
@@ -133282,7 +133248,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v0, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -133294,7 +133260,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v37, v57
 ; GFX9-NEXT:    v_mov_b32_e32 v57, v60
 ; GFX9-NEXT:    v_mov_b32_e32 v52, v56
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_mov_b32_e32 v34, v0
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -133303,14 +133269,14 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v45, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -133320,12 +133286,12 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v51, v0
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -133379,7 +133345,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:  .LBB75_2:
 ; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
@@ -133741,12 +133707,12 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 3, v45
-; GFX9-NEXT:    v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v0, 0x300, v0
 ; GFX9-NEXT:    v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_add_u32_e32 v0, 3, v44
+; GFX9-NEXT:    v_add_u32_e32 v0, 3, v40
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 3, v55
@@ -133756,7 +133722,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_add_u32_e32 v0, 3, v43
-; GFX9-NEXT:    v_add_u32_e32 v1, 3, v42
+; GFX9-NEXT:    v_add_u32_e32 v1, 3, v36
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v0, 0x300, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -133765,7 +133731,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_add_u32_e32 v0, 3, v36
+; GFX9-NEXT:    v_add_u32_e32 v0, 3, v42
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 3, v41
@@ -136698,20 +136664,19 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
 ; SI-NEXT:    v_writelane_b32 v62, s46, 3
 ; SI-NEXT:    s_cbranch_execnz .LBB77_4
 ; SI-NEXT:  .LBB77_2: ; %cmp.true
-; SI-NEXT:    v_add_f64 v[11:12], s[8:9], 1.0
+; SI-NEXT:    v_add_f64 v[19:20], s[12:13], 1.0
 ; SI-NEXT:    v_add_f64 v[3:4], s[6:7], 1.0
-; SI-NEXT:    v_add_f64 v[15:16], s[10:11], 1.0
+; SI-NEXT:    v_add_f64 v[1:2], s[22:23], 1.0
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v11
-; SI-NEXT:    v_add_f64 v[1:2], s[22:23], 1.0
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v20
 ; SI-NEXT:    v_add_f64 v[41:42], s[24:25], 1.0
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v16
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v19
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v19
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; SI-NEXT:    v_and_b32_e32 v54, 0xffff0000, v42
 ; SI-NEXT:    v_lshlrev_b32_e32 v53, 16, v42
@@ -136740,7 +136705,8 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
 ; SI-NEXT:    v_add_f64 v[31:32], s[42:43], 1.0
 ; SI-NEXT:    v_add_f64 v[27:28], s[40:41], 1.0
 ; SI-NEXT:    v_add_f64 v[23:24], s[14:15], 1.0
-; SI-NEXT:    v_add_f64 v[19:20], s[12:13], 1.0
+; SI-NEXT:    v_add_f64 v[15:16], s[10:11], 1.0
+; SI-NEXT:    v_add_f64 v[11:12], s[8:9], 1.0
 ; SI-NEXT:    v_add_f64 v[7:8], s[4:5], 1.0
 ; SI-NEXT:    v_add_f64 v[59:60], s[18:19], 1.0
 ; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v8
@@ -136749,27 +136715,27 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v7
 ; SI-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
 ; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
+; SI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v11
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v20
-; SI-NEXT:    v_and_b32_e32 v20, 0xffff0000, v19
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT:    v_and_b32_e32 v29, 0xffff0000, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v16
+; SI-NEXT:    v_and_b32_e32 v33, 0xffff0000, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v20
 ; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v24
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v24
 ; SI-NEXT:    v_and_b32_e32 v24, 0xffff0000, v23
 ; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; SI-NEXT:    v_and_b32_e32 v26, 0xffff0000, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v28
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v28
 ; SI-NEXT:    v_and_b32_e32 v28, 0xffff0000, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; SI-NEXT:    v_and_b32_e32 v30, 0xffff0000, v32
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v32
+; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v32
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v32
 ; SI-NEXT:    v_and_b32_e32 v32, 0xffff0000, v31
 ; SI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
-; SI-NEXT:    v_and_b32_e32 v34, 0xffff0000, v36
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v36
+; SI-NEXT:    v_and_b32_e32 v20, 0xffff0000, v36
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v36
 ; SI-NEXT:    v_and_b32_e32 v36, 0xffff0000, v35
 ; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v35
 ; SI-NEXT:    v_and_b32_e32 v38, 0xffff0000, v50
@@ -136860,14 +136826,14 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
 ; SI-NEXT:    ; kill: killed $sgpr46
 ; SI-NEXT:    s_branch .LBB77_2
 ; SI-NEXT:  .LBB77_4:
-; SI-NEXT:    v_mov_b32_e32 v1, s85
+; SI-NEXT:    v_mov_b32_e32 v1, s71
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v1, s83
+; SI-NEXT:    v_mov_b32_e32 v1, s69
 ; SI-NEXT:    v_readlane_b32 s4, v62, 0
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v1, s82
+; SI-NEXT:    v_mov_b32_e32 v1, s68
 ; SI-NEXT:    v_mov_b32_e32 v61, s4
 ; SI-NEXT:    v_readlane_b32 s4, v62, 1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -136886,27 +136852,27 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
 ; SI-NEXT:    v_mov_b32_e32 v14, s96
 ; SI-NEXT:    v_mov_b32_e32 v21, s87
 ; SI-NEXT:    v_mov_b32_e32 v18, s86
+; SI-NEXT:    v_mov_b32_e32 v25, s85
 ; SI-NEXT:    v_mov_b32_e32 v22, s84
-; SI-NEXT:    v_mov_b32_e32 v16, s81
-; SI-NEXT:    v_mov_b32_e32 v15, s80
-; SI-NEXT:    v_mov_b32_e32 v12, s71
-; SI-NEXT:    v_mov_b32_e32 v11, s70
-; SI-NEXT:    v_mov_b32_e32 v20, s69
-; SI-NEXT:    v_mov_b32_e32 v19, s68
+; SI-NEXT:    v_mov_b32_e32 v29, s83
+; SI-NEXT:    v_mov_b32_e32 v26, s82
+; SI-NEXT:    v_mov_b32_e32 v33, s81
+; SI-NEXT:    v_mov_b32_e32 v30, s80
+; SI-NEXT:    v_mov_b32_e32 v34, s70
 ; SI-NEXT:    v_mov_b32_e32 v8, s67
 ; SI-NEXT:    v_mov_b32_e32 v7, s66
 ; SI-NEXT:    v_mov_b32_e32 v24, s65
 ; SI-NEXT:    v_mov_b32_e32 v23, s64
-; SI-NEXT:    v_mov_b32_e32 v26, s55
-; SI-NEXT:    v_mov_b32_e32 v25, s54
+; SI-NEXT:    v_mov_b32_e32 v16, s55
+; SI-NEXT:    v_mov_b32_e32 v15, s54
 ; SI-NEXT:    v_mov_b32_e32 v28, s53
 ; SI-NEXT:    v_mov_b32_e32 v27, s52
-; SI-NEXT:    v_mov_b32_e32 v30, s51
-; SI-NEXT:    v_mov_b32_e32 v29, s50
+; SI-NEXT:    v_mov_b32_e32 v12, s51
+; SI-NEXT:    v_mov_b32_e32 v11, s50
 ; SI-NEXT:    v_mov_b32_e32 v32, s49
 ; SI-NEXT:    v_mov_b32_e32 v31, s48
-; SI-NEXT:    v_mov_b32_e32 v34, s39
-; SI-NEXT:    v_mov_b32_e32 v33, s38
+; SI-NEXT:    v_mov_b32_e32 v20, s39
+; SI-NEXT:    v_mov_b32_e32 v19, s38
 ; SI-NEXT:    v_mov_b32_e32 v36, s37
 ; SI-NEXT:    v_mov_b32_e32 v35, s36
 ; SI-NEXT:    v_mov_b32_e32 v38, s35
@@ -137039,9 +137005,9 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v34
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v20
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v33
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v19
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -137053,9 +137019,9 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 64, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v30
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v12
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v29
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v11
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -137067,9 +137033,9 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x48, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v26
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v16
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v25
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v15
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -137088,27 +137054,6 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v19
-; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v11
-; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v15
-; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; SI-NEXT:    v_readlane_b32 s99, v63, 35
@@ -137153,15 +137098,36 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v22
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v33
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v30
+; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v0
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v29
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v26
+; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v25
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v22
+; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -140475,10 +140441,10 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    v_mul_f32_e32 v34, 1.0, v36
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s17
-; SI-NEXT:    v_mul_f32_e64 v33, 1.0, s18
+; SI-NEXT:    v_mul_f32_e64 v35, 1.0, s18
 ; SI-NEXT:    v_mul_f32_e64 v36, 1.0, s21
 ; SI-NEXT:    v_mul_f32_e64 v42, 1.0, s20
-; SI-NEXT:    v_mul_f32_e64 v35, 1.0, s22
+; SI-NEXT:    v_mul_f32_e64 v33, 1.0, s22
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
@@ -140507,9 +140473,9 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB79_4
@@ -140520,13 +140486,12 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; SI-NEXT:    v_alignbit_b32 v5, v5, v8, 16
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_alignbit_b32 v3, v3, v35, 16
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_alignbit_b32 v1, v1, v35, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_alignbit_b32 v4, v4, v9, 16
 ; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(5)
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_mov_b32_e32 v59, v2
 ; SI-NEXT:    v_alignbit_b32 v0, v0, v2, 16
@@ -140536,11 +140501,10 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    v_mov_b32_e32 v47, v10
 ; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
 ; SI-NEXT:    v_mov_b32_e32 v45, v12
-; SI-NEXT:    v_mov_b32_e32 v44, v14
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_alignbit_b32 v3, v3, v33, 16
+; SI-NEXT:    v_mov_b32_e32 v33, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_alignbit_b32 v1, v1, v33, 16
 ; SI-NEXT:    v_mov_b32_e32 v62, v38
 ; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
 ; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
@@ -140580,20 +140544,20 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    v_alignbit_b32 v28, v28, v37, 16
 ; SI-NEXT:    v_mov_b32_e32 v37, v34
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(3) expcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_mov_b32_e32 v35, v7
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_mov_b32_e32 v43, v8
 ; SI-NEXT:    v_alignbit_b32 v7, v7, v8, 16
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_mov_b32_e32 v60, v9
+; SI-NEXT:    s_waitcnt vmcnt(2) expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v42, v9
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v32
 ; SI-NEXT:    v_alignbit_b32 v31, v31, v34, 16
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v42, v8
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v60, v8
 ; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
 ; SI-NEXT:    v_alignbit_b32 v8, v8, v9, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v11
@@ -140617,7 +140581,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    v_alignbit_b32 v12, v12, v14, 16
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v33, v14
+; SI-NEXT:    v_mov_b32_e32 v44, v14
 ; SI-NEXT:    v_alignbit_b32 v13, v13, v14, 16
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -140640,7 +140604,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
 ; SI-NEXT:  .LBB79_2: ; %cmp.true
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v59
 ; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
@@ -140656,7 +140620,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v35
 ; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
 ; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v42
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v60
 ; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
 ; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v57
@@ -140668,7 +140632,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v45
 ; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
 ; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v44
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v33
 ; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
 ; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v36
@@ -140780,7 +140744,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
@@ -140803,7 +140767,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v43
 ; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
 ; SI-NEXT:    v_alignbit_b32 v7, v8, v7, 16
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v60
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v42
 ; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
 ; SI-NEXT:    v_alignbit_b32 v8, v9, v8, 16
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v58
@@ -140818,7 +140782,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v63
 ; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
 ; SI-NEXT:    v_alignbit_b32 v12, v13, v12, 16
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v33
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v44
 ; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
 ; SI-NEXT:    v_alignbit_b32 v13, v14, v13, 16
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v62
@@ -140904,16 +140868,16 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB79_4:
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    s_waitcnt expcnt(3)
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; SI-NEXT:    v_mov_b32_e32 v61, v53
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
@@ -140922,7 +140886,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
 ; SI-NEXT:    v_mov_b32_e32 v57, v11
 ; SI-NEXT:    v_mov_b32_e32 v47, v10
 ; SI-NEXT:    v_mov_b32_e32 v45, v12
-; SI-NEXT:    v_mov_b32_e32 v44, v14
+; SI-NEXT:    v_mov_b32_e32 v33, v14
 ; SI-NEXT:    v_mov_b32_e32 v62, v38
 ; SI-NEXT:    v_mov_b32_e32 v38, v39
 ; SI-NEXT:    v_mov_b32_e32 v39, v41
@@ -144889,68 +144853,70 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; SI-NEXT:    s_lshr_b32 s46, s5, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, s46
 ; SI-NEXT:    s_lshr_b32 s46, s4, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, s46
+; SI-NEXT:    v_cvt_f32_f16_e32 v55, s46
 ; SI-NEXT:    s_lshr_b32 s46, s7, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, s46
+; SI-NEXT:    s_waitcnt expcnt(4)
+; SI-NEXT:    v_cvt_f32_f16_e32 v59, s46
 ; SI-NEXT:    s_lshr_b32 s46, s6, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, s46
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, s46
 ; SI-NEXT:    s_lshr_b32 s46, s9, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, s46
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, s46
 ; SI-NEXT:    s_lshr_b32 s46, s8, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, s46
+; SI-NEXT:    v_cvt_f32_f16_e32 v16, s46
 ; SI-NEXT:    s_lshr_b32 s46, s11, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v27, s46
+; SI-NEXT:    v_cvt_f32_f16_e32 v23, s46
 ; SI-NEXT:    s_lshr_b32 s46, s10, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, s46
+; SI-NEXT:    v_cvt_f32_f16_e32 v27, s46
 ; SI-NEXT:    s_lshr_b32 s46, s13, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v38, s46
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, s46
 ; SI-NEXT:    s_lshr_b32 s46, s12, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v50, s46
+; SI-NEXT:    v_cvt_f32_f16_e32 v38, s46
 ; SI-NEXT:    s_lshr_b32 s46, s15, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v54, s46
+; SI-NEXT:    v_cvt_f32_f16_e32 v50, s46
 ; SI-NEXT:    s_lshr_b32 s46, s14, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, s46
+; SI-NEXT:    v_cvt_f32_f16_e32 v54, s46
 ; SI-NEXT:    s_lshr_b32 s46, s41, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, s46
+; SI-NEXT:    v_cvt_f32_f16_e32 v40, s46
 ; SI-NEXT:    s_lshr_b32 s46, s40, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v56, s46
+; SI-NEXT:    v_cvt_f32_f16_e32 v42, s46
 ; SI-NEXT:    s_lshr_b32 s46, s43, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, s46
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_cvt_f32_f16_e32 v61, s46
 ; SI-NEXT:    s_lshr_b32 s46, s42, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, s46
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, s46
 ; SI-NEXT:    s_lshr_b32 s46, s45, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v32, s46
 ; SI-NEXT:    s_lshr_b32 s46, s44, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v36, s46
 ; SI-NEXT:    s_lshr_b32 s46, s29, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, s46
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, s46
 ; SI-NEXT:    s_lshr_b32 s46, s28, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v48, s46
 ; SI-NEXT:    s_lshr_b32 s46, s27, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, s7
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s42
 ; SI-NEXT:    v_cvt_f32_f16_e32 v24, s46
 ; SI-NEXT:    s_lshr_b32 s46, s26, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v52, s46
 ; SI-NEXT:    s_lshr_b32 s46, s25, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v28, s46
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, s46
 ; SI-NEXT:    s_lshr_b32 s46, s24, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, s46
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, s46
 ; SI-NEXT:    s_lshr_b32 s46, s23, 16
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s45
-; SI-NEXT:    v_cvt_f32_f16_e32 v20, s46
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, s46
 ; SI-NEXT:    s_lshr_b32 s46, s22, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, s46
+; SI-NEXT:    v_cvt_f32_f16_e32 v35, s46
 ; SI-NEXT:    s_lshr_b32 s46, s21, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v46, s46
 ; SI-NEXT:    s_lshr_b32 s46, s20, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, s46
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, s46
 ; SI-NEXT:    s_lshr_b32 s46, s19, 16
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s44
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, s46
+; SI-NEXT:    v_cvt_f32_f16_e32 v57, s46
 ; SI-NEXT:    s_lshr_b32 s46, s18, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v58, s46
 ; SI-NEXT:    s_lshr_b32 s46, s17, 16
@@ -144958,7 +144924,8 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; SI-NEXT:    s_lshr_b32 s46, s16, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v62, s46
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, s5
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v56, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, s7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, s6
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, s9
 ; SI-NEXT:    v_cvt_f32_f16_e32 v18, s8
@@ -144966,12 +144933,11 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; SI-NEXT:    v_cvt_f32_f16_e32 v19, s10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v39, s13
 ; SI-NEXT:    v_cvt_f32_f16_e32 v51, s12
-; SI-NEXT:    v_cvt_f32_f16_e32 v55, s15
+; SI-NEXT:    v_cvt_f32_f16_e32 v41, s15
 ; SI-NEXT:    v_cvt_f32_f16_e32 v43, s14
-; SI-NEXT:    v_cvt_f32_f16_e32 v44, s41
-; SI-NEXT:    v_cvt_f32_f16_e32 v57, s40
-; SI-NEXT:    v_cvt_f32_f16_e32 v59, s43
-; SI-NEXT:    v_cvt_f32_f16_e32 v61, s42
+; SI-NEXT:    v_cvt_f32_f16_e32 v45, s41
+; SI-NEXT:    v_cvt_f32_f16_e32 v44, s40
+; SI-NEXT:    v_cvt_f32_f16_e32 v63, s43
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    v_cvt_f32_f16_e32 v37, s29
 ; SI-NEXT:    v_cvt_f32_f16_e32 v22, s28
@@ -144985,7 +144951,7 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; SI-NEXT:    v_cvt_f32_f16_e32 v47, s20
 ; SI-NEXT:    v_cvt_f32_f16_e32 v34, s19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v25, s18
-; SI-NEXT:    v_cvt_f32_f16_e32 v63, s17
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s17
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s16
 ; SI-NEXT:    s_cbranch_execnz .LBB81_3
@@ -145010,146 +144976,143 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v49
-; SI-NEXT:    v_add_f64 v[14:15], s[10:11], 1.0
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v36
+; SI-NEXT:    v_add_f64 v[14:15], s[10:11], 1.0
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v27, v13
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f64 v[29:30], s[42:43], 1.0
-; SI-NEXT:    v_add_f64 v[6:7], s[6:7], 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v61, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v29, v48
-; SI-NEXT:    v_lshrrev_b32_e32 v24, 16, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v24
-; SI-NEXT:    v_add_f64 v[25:26], s[40:41], 1.0
-; SI-NEXT:    v_add_f64 v[33:34], s[44:45], 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v44, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v52
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v37
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v33
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v15
+; SI-NEXT:    v_cvt_f32_f16_e32 v23, v13
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f64 v[6:7], s[6:7], 1.0
 ; SI-NEXT:    v_add_f64 v[4:5], s[4:5], 1.0
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v59, v28
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v57, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v34
-; SI-NEXT:    v_add_f64 v[21:22], s[14:15], 1.0
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v56, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v14
+; SI-NEXT:    v_add_f64 v[29:30], s[42:43], 1.0
+; SI-NEXT:    v_lshrrev_b32_e32 v24, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v29
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v29
+; SI-NEXT:    v_cvt_f32_f16_e32 v29, v48
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v24
+; SI-NEXT:    v_add_f64 v[10:11], s[8:9], 1.0
+; SI-NEXT:    v_add_f64 v[25:26], s[40:41], 1.0
+; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v45, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v26, v52
+; SI-NEXT:    v_cvt_f32_f16_e32 v16, v17
+; SI-NEXT:    v_add_f64 v[21:22], s[14:15], 1.0
+; SI-NEXT:    v_add_f64 v[33:34], s[44:45], 1.0
 ; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v21
 ; SI-NEXT:    v_add_f64 v[18:19], s[12:13], 1.0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v43, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v59, v30
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v33
+; SI-NEXT:    v_cvt_f32_f16_e32 v63, v30
 ; SI-NEXT:    v_cvt_f32_f16_e32 v30, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v21, v1
 ; SI-NEXT:    v_add_f64 v[1:2], s[18:19], 1.0
+; SI-NEXT:    v_cvt_f32_f16_e32 v61, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v62, v62
 ; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v34
-; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v7
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v25
+; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v18
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v51, v18
+; SI-NEXT:    v_cvt_f32_f16_e32 v44, v25
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v25
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v34
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v34, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v25, v1
 ; SI-NEXT:    v_add_f64 v[1:2], s[16:17], 1.0
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v63, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v28
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f64 v[10:11], s[8:9], 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v55, v22
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v36
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v33
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v19
 ; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v20
-; SI-NEXT:    v_cvt_f32_f16_e32 v62, v62
+; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v39, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v22
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v10
+; SI-NEXT:    v_cvt_f32_f16_e32 v41, v22
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v36
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v35
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v20
+; SI-NEXT:    v_cvt_f32_f16_e32 v36, v58
+; SI-NEXT:    v_cvt_f32_f16_e32 v58, v7
+; SI-NEXT:    v_mov_b32_e32 v7, v61
+; SI-NEXT:    v_mov_b32_e32 v61, v62
+; SI-NEXT:    v_cvt_f32_f16_e32 v62, v19
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v33
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v57, v57
 ; SI-NEXT:    v_add_f64 v[46:47], s[20:21], 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v14
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v11
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v19
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v46
 ; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v47
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v47
-; SI-NEXT:    v_cvt_f32_f16_e32 v47, v46
-; SI-NEXT:    v_cvt_f32_f16_e32 v46, v15
-; SI-NEXT:    v_mov_b32_e32 v15, v45
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v18
-; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v13
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v18, v12
-; SI-NEXT:    v_mov_b32_e32 v12, v62
-; SI-NEXT:    v_cvt_f32_f16_e32 v62, v19
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v37, v37
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, v49
 ; SI-NEXT:    v_cvt_f32_f16_e32 v53, v53
+; SI-NEXT:    v_cvt_f32_f16_e32 v33, v47
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v13
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v47, v46
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v35
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v32
+; SI-NEXT:    v_cvt_f32_f16_e32 v55, v32
+; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v38, v38
 ; SI-NEXT:    v_cvt_f32_f16_e32 v50, v50
 ; SI-NEXT:    v_cvt_f32_f16_e32 v54, v54
 ; SI-NEXT:    v_cvt_f32_f16_e32 v40, v40
 ; SI-NEXT:    v_cvt_f32_f16_e32 v42, v42
-; SI-NEXT:    v_cvt_f32_f16_e32 v56, v56
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v58
 ; SI-NEXT:    v_cvt_f32_f16_e32 v32, v60
-; SI-NEXT:    v_cvt_f32_f16_e32 v58, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v46, v15
+; SI-NEXT:    v_mov_b32_e32 v15, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v14
+; SI-NEXT:    v_mov_b32_e32 v14, v12
+; SI-NEXT:    v_mov_b32_e32 v12, v57
+; SI-NEXT:    v_cvt_f32_f16_e32 v57, v18
 ; SI-NEXT:    v_cvt_f32_f16_e32 v60, v4
-; SI-NEXT:    v_mov_b32_e32 v7, v57
-; SI-NEXT:    v_mov_b32_e32 v57, v3
+; SI-NEXT:    v_mov_b32_e32 v18, v3
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_cvt_f32_f16_e32 v48, v13
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v17
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_cvt_f32_f16_e32 v35, v35
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v24, v13
 ; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v52, v13
 ; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v28, v13
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v36, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v13
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v20, v13
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
 ; SI-NEXT:  .LBB81_3: ; %end
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v62
@@ -145160,7 +145123,7 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v60
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v63
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
@@ -145173,13 +145136,13 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 8, v0
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v57
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v34
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v45
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v47
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 16, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -145193,28 +145156,28 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v13
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v35
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v21
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 24, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v30
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 28, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v17
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v26
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 32, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v53
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 36, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -145242,7 +145205,7 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v28
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v37
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 52, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -145264,111 +145227,111 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v6
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v35
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v61
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v12
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 64, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v6
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v59
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v61
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v63
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x44, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v56
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v57
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v42
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v44
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x48, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v42
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v44
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v40
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v45
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x4c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v40
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v54
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v43
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x50, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v54
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v55
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v50
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v41
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x54, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v50
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v38
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v51
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x58, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v38
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v31
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v39
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x5c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v31
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v27
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v19
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x60, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v11
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x64, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v23
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v18
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x68, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v9
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x6c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v15
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x70, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v59
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v14
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x74, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v41
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v55
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v56
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x78, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -145404,24 +145367,24 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr62
-; SI-NEXT:    ; implicit-def: $vgpr63
+; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr60
 ; SI-NEXT:    ; implicit-def: $vgpr25
 ; SI-NEXT:    ; implicit-def: $vgpr58
 ; SI-NEXT:    ; implicit-def: $vgpr34
-; SI-NEXT:    ; implicit-def: $vgpr14
+; SI-NEXT:    ; implicit-def: $vgpr57
 ; SI-NEXT:    ; implicit-def: $vgpr47
-; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    ; implicit-def: $vgpr46
 ; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr26
 ; SI-NEXT:    ; implicit-def: $vgpr17
+; SI-NEXT:    ; implicit-def: $vgpr26
+; SI-NEXT:    ; implicit-def: $vgpr20
 ; SI-NEXT:    ; implicit-def: $vgpr53
-; SI-NEXT:    ; implicit-def: $vgpr28
+; SI-NEXT:    ; implicit-def: $vgpr13
 ; SI-NEXT:    ; implicit-def: $vgpr29
 ; SI-NEXT:    ; implicit-def: $vgpr52
 ; SI-NEXT:    ; implicit-def: $vgpr49
@@ -145429,39 +145392,39 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a
 ; SI-NEXT:    ; implicit-def: $vgpr22
 ; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr37
-; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr28
 ; SI-NEXT:    ; implicit-def: $vgpr36
 ; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr61
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr59
 ; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $vgpr57
-; SI-NEXT:    ; implicit-def: $vgpr56
+; SI-NEXT:    ; implicit-def: $vgpr63
+; SI-NEXT:    ; implicit-def: $vgpr61
 ; SI-NEXT:    ; implicit-def: $vgpr44
 ; SI-NEXT:    ; implicit-def: $vgpr42
-; SI-NEXT:    ; implicit-def: $vgpr43
+; SI-NEXT:    ; implicit-def: $vgpr45
 ; SI-NEXT:    ; implicit-def: $vgpr40
-; SI-NEXT:    ; implicit-def: $vgpr55
+; SI-NEXT:    ; implicit-def: $vgpr43
 ; SI-NEXT:    ; implicit-def: $vgpr54
-; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    ; implicit-def: $vgpr41
 ; SI-NEXT:    ; implicit-def: $vgpr50
-; SI-NEXT:    ; implicit-def: $vgpr39
+; SI-NEXT:    ; implicit-def: $vgpr51
 ; SI-NEXT:    ; implicit-def: $vgpr38
-; SI-NEXT:    ; implicit-def: $vgpr19
+; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $vgpr19
 ; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr18
+; SI-NEXT:    ; implicit-def: $vgpr11
 ; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    ; implicit-def: $vgpr18
 ; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr15
+; SI-NEXT:    ; implicit-def: $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $vgpr15
 ; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr14
+; SI-NEXT:    ; implicit-def: $vgpr59
+; SI-NEXT:    ; implicit-def: $vgpr56
+; SI-NEXT:    ; implicit-def: $vgpr55
 ; SI-NEXT:    ; implicit-def: $vgpr7
 ; SI-NEXT:    ; implicit-def: $vgpr10
 ; SI-NEXT:    ; implicit-def: $vgpr3
@@ -149735,24 +149698,24 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:28
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:36
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:48
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:48
 ; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:52
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:64
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:68
-; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v55, 16, v3
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:68
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v55, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v43, 16, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v15
 ; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v17
 ; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v19
 ; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v21
@@ -149773,23 +149736,23 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_lshlrev_b32_e32 v54, 16, v12
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v38
 ; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v36
 ; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v34
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v32
-; SI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v32
+; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
@@ -149802,46 +149765,45 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB87_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    s_waitcnt expcnt(3)
+; SI-NEXT:    s_waitcnt expcnt(2)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v7, v0, v58
+; SI-NEXT:    v_or_b32_e32 v7, v0, v61
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v4
-; SI-NEXT:    v_or_b32_e32 v9, v0, v61
+; SI-NEXT:    v_or_b32_e32 v9, v0, v50
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v6
-; SI-NEXT:    v_or_b32_e32 v10, v0, v50
+; SI-NEXT:    v_or_b32_e32 v10, v0, v43
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v57
-; SI-NEXT:    v_or_b32_e32 v11, v0, v45
+; SI-NEXT:    v_or_b32_e32 v11, v0, v41
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v56
-; SI-NEXT:    v_or_b32_e32 v12, v0, v41
+; SI-NEXT:    v_or_b32_e32 v12, v0, v40
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v49
-; SI-NEXT:    v_mov_b32_e32 v36, v41
-; SI-NEXT:    v_mov_b32_e32 v41, v13
+; SI-NEXT:    v_mov_b32_e32 v52, v57
+; SI-NEXT:    v_mov_b32_e32 v57, v40
+; SI-NEXT:    v_mov_b32_e32 v40, v49
+; SI-NEXT:    v_mov_b32_e32 v49, v13
 ; SI-NEXT:    v_or_b32_e32 v13, v0, v13
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v14
-; SI-NEXT:    v_mov_b32_e32 v51, v50
-; SI-NEXT:    v_mov_b32_e32 v50, v45
-; SI-NEXT:    v_mov_b32_e32 v45, v14
-; SI-NEXT:    v_or_b32_e32 v14, v0, v40
+; SI-NEXT:    v_mov_b32_e32 v36, v41
+; SI-NEXT:    v_mov_b32_e32 v41, v14
+; SI-NEXT:    v_or_b32_e32 v14, v0, v48
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v16
-; SI-NEXT:    v_mov_b32_e32 v52, v57
-; SI-NEXT:    v_mov_b32_e32 v57, v49
-; SI-NEXT:    v_mov_b32_e32 v49, v40
-; SI-NEXT:    v_mov_b32_e32 v40, v15
+; SI-NEXT:    v_mov_b32_e32 v51, v50
+; SI-NEXT:    v_mov_b32_e32 v50, v43
+; SI-NEXT:    v_mov_b32_e32 v43, v48
+; SI-NEXT:    v_mov_b32_e32 v48, v15
 ; SI-NEXT:    v_or_b32_e32 v15, v0, v15
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v18
-; SI-NEXT:    v_mov_b32_e32 v34, v58
-; SI-NEXT:    v_mov_b32_e32 v58, v61
+; SI-NEXT:    v_mov_b32_e32 v38, v61
 ; SI-NEXT:    v_mov_b32_e32 v61, v56
 ; SI-NEXT:    v_mov_b32_e32 v56, v16
 ; SI-NEXT:    v_or_b32_e32 v16, v0, v37
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v20
-; SI-NEXT:    s_waitcnt expcnt(2)
 ; SI-NEXT:    v_or_b32_e32 v17, v0, v17
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v22
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -149875,7 +149837,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    s_and_b32 s6, s20, 0xffff
 ; SI-NEXT:    s_lshl_b32 s7, s21, 16
 ; SI-NEXT:    v_or_b32_e32 v26, v0, v26
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v59
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v45
 ; SI-NEXT:    s_or_b32 s6, s6, s7
 ; SI-NEXT:    s_and_b32 s7, s22, 0xffff
 ; SI-NEXT:    s_lshl_b32 s8, s23, 16
@@ -149886,7 +149848,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    s_lshl_b32 s9, s25, 16
 ; SI-NEXT:    v_mov_b32_e32 v33, v28
 ; SI-NEXT:    v_or_b32_e32 v28, v0, v5
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v43
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v59
 ; SI-NEXT:    s_or_b32 s8, s8, s9
 ; SI-NEXT:    s_and_b32 s9, s26, 0xffff
 ; SI-NEXT:    s_lshl_b32 s10, s27, 16
@@ -149898,7 +149860,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    s_lshl_b32 s11, s29, 16
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; SI-NEXT:    v_or_b32_e32 v30, v0, v3
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v38
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v58
 ; SI-NEXT:    s_or_b32 s10, s10, s11
 ; SI-NEXT:    v_mov_b32_e32 v63, v2
 ; SI-NEXT:    v_mov_b32_e32 v32, v55
@@ -149906,9 +149868,9 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v55, v4
 ; SI-NEXT:    v_mov_b32_e32 v53, v6
 ; SI-NEXT:    v_mov_b32_e32 v47, v46
-; SI-NEXT:    v_mov_b32_e32 v59, v44
-; SI-NEXT:    v_mov_b32_e32 v43, v42
-; SI-NEXT:    v_or_b32_e32 v31, v0, v48
+; SI-NEXT:    v_mov_b32_e32 v45, v44
+; SI-NEXT:    v_mov_b32_e32 v59, v42
+; SI-NEXT:    v_or_b32_e32 v31, v0, v34
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    v_mov_b32_e32 v2, s6
@@ -149918,13 +149880,12 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v6, s10
 ; SI-NEXT:    s_cbranch_execnz .LBB87_3
 ; SI-NEXT:  .LBB87_2: ; %cmp.true
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v63
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v32, v1
 ; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x30000, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v38, v43
 ; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s17, 16
@@ -149968,42 +149929,42 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v34, v0
+; SI-NEXT:    v_or_b32_e32 v0, v38, v0
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v55
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v58, v0
+; SI-NEXT:    v_or_b32_e32 v0, v51, v0
 ; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v53
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v51, v0
+; SI-NEXT:    v_or_b32_e32 v0, v50, v0
 ; SI-NEXT:    v_add_i32_e32 v10, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v52
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v50, v0
+; SI-NEXT:    v_or_b32_e32 v0, v36, v0
 ; SI-NEXT:    v_add_i32_e32 v11, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v61
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v36, v0
+; SI-NEXT:    v_or_b32_e32 v0, v57, v0
 ; SI-NEXT:    v_add_i32_e32 v12, vcc, 0x30000, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v57
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v40
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v41, v0
+; SI-NEXT:    v_or_b32_e32 v0, v49, v0
 ; SI-NEXT:    v_add_i32_e32 v13, vcc, 0x30000, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v45
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v41
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v49, v0
+; SI-NEXT:    v_or_b32_e32 v0, v43, v0
 ; SI-NEXT:    v_add_i32_e32 v14, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v56
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v40, v0
+; SI-NEXT:    v_or_b32_e32 v0, v48, v0
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 0x30000, v0
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v37
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -150065,7 +150026,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v25, vcc, 0x30000, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v47
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -150081,12 +150042,12 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v27, vcc, 0x30000, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v59
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v45
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    v_add_i32_e32 v28, vcc, 0x30000, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
@@ -150095,7 +150056,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v29, vcc, 0x30000, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v38
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v59
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -150130,26 +150091,26 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a,
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB87_4:
-; SI-NEXT:    v_mov_b32_e32 v34, v58
+; SI-NEXT:    v_mov_b32_e32 v38, v61
 ; SI-NEXT:    v_mov_b32_e32 v32, v55
-; SI-NEXT:    v_mov_b32_e32 v58, v61
 ; SI-NEXT:    v_mov_b32_e32 v63, v2
 ; SI-NEXT:    v_mov_b32_e32 v55, v4
 ; SI-NEXT:    v_mov_b32_e32 v53, v6
 ; SI-NEXT:    v_mov_b32_e32 v52, v57
 ; SI-NEXT:    v_mov_b32_e32 v51, v50
 ; SI-NEXT:    v_mov_b32_e32 v61, v56
-; SI-NEXT:    v_mov_b32_e32 v50, v45
+; SI-NEXT:    v_mov_b32_e32 v50, v43
 ; SI-NEXT:    v_mov_b32_e32 v36, v41
-; SI-NEXT:    v_mov_b32_e32 v41, v13
-; SI-NEXT:    v_mov_b32_e32 v57, v49
-; SI-NEXT:    v_mov_b32_e32 v49, v40
-; SI-NEXT:    v_mov_b32_e32 v40, v15
-; SI-NEXT:    v_mov_b32_e32 v45, v14
+; SI-NEXT:    v_mov_b32_e32 v57, v40
+; SI-NEXT:    v_mov_b32_e32 v40, v49
+; SI-NEXT:    v_mov_b32_e32 v49, v13
+; SI-NEXT:    v_mov_b32_e32 v43, v48
+; SI-NEXT:    v_mov_b32_e32 v48, v15
+; SI-NEXT:    v_mov_b32_e32 v41, v14
 ; SI-NEXT:    v_mov_b32_e32 v56, v16
 ; SI-NEXT:    v_mov_b32_e32 v47, v46
-; SI-NEXT:    v_mov_b32_e32 v59, v44
-; SI-NEXT:    v_mov_b32_e32 v43, v42
+; SI-NEXT:    v_mov_b32_e32 v45, v44
+; SI-NEXT:    v_mov_b32_e32 v59, v42
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    v_mov_b32_e32 v37, v20
 ; SI-NEXT:    v_mov_b32_e32 v39, v23
@@ -157056,7 +157017,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:332
@@ -157067,18 +157027,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:312
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:308
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:304
-; SI-NEXT:    ; implicit-def: $vgpr44 : SGPR spill to VGPR lane
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_writelane_b32 v41, s30, 0
+; SI-NEXT:    ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
+; SI-NEXT:    s_mov_b32 s72, s21
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_writelane_b32 v44, s29, 0
-; SI-NEXT:    v_writelane_b32 v44, s28, 1
-; SI-NEXT:    v_writelane_b32 v44, s27, 2
-; SI-NEXT:    v_writelane_b32 v44, s26, 3
-; SI-NEXT:    v_writelane_b32 v44, s19, 4
-; SI-NEXT:    v_writelane_b32 v44, s18, 5
-; SI-NEXT:    v_writelane_b32 v44, s17, 6
-; SI-NEXT:    v_writelane_b32 v44, s16, 7
+; SI-NEXT:    v_writelane_b32 v43, s19, 0
+; SI-NEXT:    v_writelane_b32 v43, s18, 1
+; SI-NEXT:    v_writelane_b32 v43, s17, 2
+; SI-NEXT:    v_writelane_b32 v43, s16, 3
+; SI-NEXT:    s_mov_b32 s60, s24
+; SI-NEXT:    v_writelane_b32 v41, s30, 0
 ; SI-NEXT:    v_writelane_b32 v41, s31, 1
 ; SI-NEXT:    v_writelane_b32 v41, s34, 2
 ; SI-NEXT:    v_writelane_b32 v41, s35, 3
@@ -157102,6 +157059,8 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    v_writelane_b32 v41, s69, 21
 ; SI-NEXT:    v_writelane_b32 v41, s70, 22
 ; SI-NEXT:    v_writelane_b32 v41, s71, 23
+; SI-NEXT:    s_mov_b32 s77, s28
+; SI-NEXT:    s_mov_b32 s76, s27
 ; SI-NEXT:    v_writelane_b32 v41, s80, 24
 ; SI-NEXT:    v_writelane_b32 v41, s81, 25
 ; SI-NEXT:    v_writelane_b32 v41, s82, 26
@@ -157112,92 +157071,100 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    v_writelane_b32 v41, s87, 31
 ; SI-NEXT:    v_writelane_b32 v41, s96, 32
 ; SI-NEXT:    v_writelane_b32 v41, s97, 33
-; SI-NEXT:    ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
-; SI-NEXT:    v_readfirstlane_b32 s30, v16
-; SI-NEXT:    v_readfirstlane_b32 s31, v15
-; SI-NEXT:    v_readfirstlane_b32 s34, v21
-; SI-NEXT:    v_readfirstlane_b32 s35, v22
-; SI-NEXT:    v_readfirstlane_b32 s36, v20
-; SI-NEXT:    v_readfirstlane_b32 s37, v19
-; SI-NEXT:    v_readfirstlane_b32 s38, v25
-; SI-NEXT:    v_readfirstlane_b32 s39, v26
-; SI-NEXT:    v_readfirstlane_b32 s48, v24
-; SI-NEXT:    v_readfirstlane_b32 s49, v23
-; SI-NEXT:    v_readfirstlane_b32 s50, v29
-; SI-NEXT:    v_readfirstlane_b32 s51, v30
-; SI-NEXT:    v_readfirstlane_b32 s52, v28
-; SI-NEXT:    v_readfirstlane_b32 s53, v27
 ; SI-NEXT:    v_writelane_b32 v41, s98, 34
 ; SI-NEXT:    v_writelane_b32 v41, s99, 35
-; SI-NEXT:    s_mov_b32 s6, s21
+; SI-NEXT:    s_mov_b32 s79, s26
+; SI-NEXT:    v_readfirstlane_b32 s38, v20
+; SI-NEXT:    ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
+; SI-NEXT:    v_readfirstlane_b32 s39, v19
+; SI-NEXT:    v_writelane_b32 v42, s38, 0
+; SI-NEXT:    v_readfirstlane_b32 s48, v25
+; SI-NEXT:    v_writelane_b32 v42, s39, 1
+; SI-NEXT:    v_readfirstlane_b32 s49, v26
+; SI-NEXT:    v_writelane_b32 v42, s48, 2
+; SI-NEXT:    v_readfirstlane_b32 s50, v24
+; SI-NEXT:    v_writelane_b32 v42, s49, 3
+; SI-NEXT:    v_readfirstlane_b32 s51, v23
+; SI-NEXT:    v_writelane_b32 v42, s50, 4
+; SI-NEXT:    v_readfirstlane_b32 s52, v29
+; SI-NEXT:    v_writelane_b32 v42, s51, 5
+; SI-NEXT:    v_readfirstlane_b32 s53, v30
+; SI-NEXT:    v_writelane_b32 v42, s52, 6
+; SI-NEXT:    v_readfirstlane_b32 s54, v28
+; SI-NEXT:    v_writelane_b32 v42, s53, 7
+; SI-NEXT:    v_readfirstlane_b32 s55, v27
+; SI-NEXT:    v_writelane_b32 v42, s54, 8
+; SI-NEXT:    v_writelane_b32 v42, s55, 9
 ; SI-NEXT:    v_readfirstlane_b32 s16, v1
 ; SI-NEXT:    v_readfirstlane_b32 s17, v2
 ; SI-NEXT:    v_readfirstlane_b32 s18, v5
 ; SI-NEXT:    v_readfirstlane_b32 s19, v6
-; SI-NEXT:    v_readfirstlane_b32 s78, v4
-; SI-NEXT:    v_readfirstlane_b32 s79, v3
-; SI-NEXT:    v_readfirstlane_b32 s88, v9
-; SI-NEXT:    v_readfirstlane_b32 s89, v10
-; SI-NEXT:    v_readfirstlane_b32 s90, v8
-; SI-NEXT:    v_readfirstlane_b32 s91, v7
-; SI-NEXT:    v_readfirstlane_b32 s92, v13
-; SI-NEXT:    v_readfirstlane_b32 s93, v14
-; SI-NEXT:    v_readfirstlane_b32 s94, v12
+; SI-NEXT:    v_readfirstlane_b32 s88, v4
+; SI-NEXT:    v_readfirstlane_b32 s89, v3
+; SI-NEXT:    v_readfirstlane_b32 s90, v9
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_readfirstlane_b32 s4, v31
+; SI-NEXT:    v_readfirstlane_b32 s6, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:300
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:296
 ; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:292
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:288
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:284
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:280
-; SI-NEXT:    v_writelane_b32 v44, s4, 8
 ; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v32
-; SI-NEXT:    v_writelane_b32 v44, s4, 9
+; SI-NEXT:    v_writelane_b32 v43, s4, 4
 ; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v33
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:276
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:272
-; SI-NEXT:    v_writelane_b32 v44, s4, 10
+; SI-NEXT:    v_writelane_b32 v43, s4, 5
 ; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v34
-; SI-NEXT:    v_writelane_b32 v44, s4, 11
+; SI-NEXT:    v_writelane_b32 v43, s4, 6
 ; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v35
-; SI-NEXT:    v_writelane_b32 v44, s4, 12
+; SI-NEXT:    v_writelane_b32 v43, s4, 7
 ; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v36
-; SI-NEXT:    v_writelane_b32 v44, s4, 13
+; SI-NEXT:    v_writelane_b32 v43, s4, 8
 ; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v37
-; SI-NEXT:    v_writelane_b32 v44, s4, 14
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:268
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:264
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:260
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:256
+; SI-NEXT:    v_writelane_b32 v43, s4, 9
 ; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v38
-; SI-NEXT:    v_writelane_b32 v44, s4, 15
-; SI-NEXT:    v_readfirstlane_b32 s95, v11
+; SI-NEXT:    v_writelane_b32 v43, s4, 10
+; SI-NEXT:    v_readfirstlane_b32 s91, v10
+; SI-NEXT:    v_readfirstlane_b32 s92, v8
+; SI-NEXT:    v_readfirstlane_b32 s93, v7
+; SI-NEXT:    v_readfirstlane_b32 s94, v13
+; SI-NEXT:    v_readfirstlane_b32 s95, v14
+; SI-NEXT:    v_readfirstlane_b32 s30, v17
+; SI-NEXT:    v_readfirstlane_b32 s31, v18
+; SI-NEXT:    v_readfirstlane_b32 s34, v16
+; SI-NEXT:    v_readfirstlane_b32 s35, v15
+; SI-NEXT:    v_readfirstlane_b32 s36, v21
+; SI-NEXT:    v_readfirstlane_b32 s37, v22
 ; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
-; SI-NEXT:    v_writelane_b32 v44, s4, 16
+; SI-NEXT:    v_writelane_b32 v43, s4, 11
 ; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v39
-; SI-NEXT:    v_writelane_b32 v44, s4, 17
+; SI-NEXT:    v_writelane_b32 v43, s4, 12
 ; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v48
-; SI-NEXT:    v_writelane_b32 v44, s4, 18
+; SI-NEXT:    v_writelane_b32 v43, s4, 13
 ; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v49
-; SI-NEXT:    v_writelane_b32 v44, s4, 19
+; SI-NEXT:    v_writelane_b32 v43, s4, 14
 ; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v50
-; SI-NEXT:    v_writelane_b32 v44, s4, 20
+; SI-NEXT:    v_writelane_b32 v43, s4, 15
 ; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v51
-; SI-NEXT:    v_writelane_b32 v44, s4, 21
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:252
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:248
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:244
@@ -157206,49 +157173,39 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:232
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:228
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_readfirstlane_b32 s4, v32
-; SI-NEXT:    v_writelane_b32 v44, s4, 22
+; SI-NEXT:    v_readfirstlane_b32 s75, v32
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_readfirstlane_b32 s4, v33
+; SI-NEXT:    v_readfirstlane_b32 s61, v33
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:224
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:220
-; SI-NEXT:    v_writelane_b32 v44, s4, 23
+; SI-NEXT:    v_writelane_b32 v43, s4, 16
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_readfirstlane_b32 s4, v34
-; SI-NEXT:    v_writelane_b32 v44, s4, 24
+; SI-NEXT:    v_readfirstlane_b32 s43, v34
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_readfirstlane_b32 s4, v35
-; SI-NEXT:    v_writelane_b32 v44, s4, 25
+; SI-NEXT:    v_readfirstlane_b32 s40, v35
 ; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v36
-; SI-NEXT:    v_writelane_b32 v44, s4, 26
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_readfirstlane_b32 s4, v37
-; SI-NEXT:    v_writelane_b32 v44, s4, 27
+; SI-NEXT:    v_readfirstlane_b32 s63, v37
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:216
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:212
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:208
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:204
+; SI-NEXT:    v_writelane_b32 v43, s4, 17
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_readfirstlane_b32 s4, v31
-; SI-NEXT:    v_writelane_b32 v44, s4, 28
+; SI-NEXT:    v_readfirstlane_b32 s59, v31
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_readfirstlane_b32 s4, v38
-; SI-NEXT:    v_writelane_b32 v44, s4, 29
+; SI-NEXT:    v_readfirstlane_b32 s42, v38
 ; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_readfirstlane_b32 s4, v39
-; SI-NEXT:    v_writelane_b32 v44, s4, 30
+; SI-NEXT:    v_readfirstlane_b32 s73, v39
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_readfirstlane_b32 s4, v48
-; SI-NEXT:    v_writelane_b32 v44, s4, 31
+; SI-NEXT:    v_readfirstlane_b32 s21, v48
 ; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_readfirstlane_b32 s4, v49
-; SI-NEXT:    v_writelane_b32 v44, s4, 32
+; SI-NEXT:    v_readfirstlane_b32 s57, v49
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_readfirstlane_b32 s4, v50
-; SI-NEXT:    v_writelane_b32 v44, s4, 33
+; SI-NEXT:    v_readfirstlane_b32 s13, v50
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_readfirstlane_b32 s4, v51
+; SI-NEXT:    v_readfirstlane_b32 s45, v51
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:200
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:196
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:192
@@ -157256,43 +157213,51 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:184
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:180
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:176
+; SI-NEXT:    s_waitcnt vmcnt(12)
+; SI-NEXT:    v_readfirstlane_b32 s47, v32
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_readfirstlane_b32 s58, v33
+; SI-NEXT:    v_readfirstlane_b32 s24, v33
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:172
 ; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:168
-; SI-NEXT:    v_readfirstlane_b32 s26, v32
-; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_readfirstlane_b32 s77, v34
-; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_readfirstlane_b32 s63, v35
-; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_readfirstlane_b32 s57, v36
-; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_readfirstlane_b32 s56, v37
 ; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:164
 ; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:160
 ; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:156
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:152
-; SI-NEXT:    v_writelane_b32 v44, s4, 34
+; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    v_readfirstlane_b32 s78, v34
+; SI-NEXT:    v_readfirstlane_b32 s4, v35
+; SI-NEXT:    v_writelane_b32 v43, s4, 18
+; SI-NEXT:    v_readfirstlane_b32 s4, v36
+; SI-NEXT:    v_writelane_b32 v43, s4, 19
+; SI-NEXT:    s_waitcnt vmcnt(13)
+; SI-NEXT:    v_readfirstlane_b32 s4, v37
+; SI-NEXT:    v_writelane_b32 v43, s4, 20
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_readfirstlane_b32 s61, v31
+; SI-NEXT:    v_readfirstlane_b32 s4, v31
+; SI-NEXT:    v_writelane_b32 v43, s4, 21
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_readfirstlane_b32 s74, v38
+; SI-NEXT:    v_readfirstlane_b32 s4, v38
+; SI-NEXT:    v_writelane_b32 v43, s4, 22
 ; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_readfirstlane_b32 s76, v39
+; SI-NEXT:    v_readfirstlane_b32 s4, v39
+; SI-NEXT:    v_writelane_b32 v43, s4, 23
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_readfirstlane_b32 s47, v48
+; SI-NEXT:    v_readfirstlane_b32 s4, v48
+; SI-NEXT:    v_writelane_b32 v43, s4, 24
 ; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_readfirstlane_b32 s45, v49
+; SI-NEXT:    v_readfirstlane_b32 s4, v49
+; SI-NEXT:    v_writelane_b32 v43, s4, 25
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_readfirstlane_b32 s60, v50
+; SI-NEXT:    v_readfirstlane_b32 s4, v50
+; SI-NEXT:    v_writelane_b32 v43, s4, 26
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_readfirstlane_b32 s42, v51
+; SI-NEXT:    v_readfirstlane_b32 s4, v51
+; SI-NEXT:    v_writelane_b32 v43, s4, 27
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:148
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:144
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_readfirstlane_b32 s13, v33
+; SI-NEXT:    v_readfirstlane_b32 s4, v33
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:140
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:136
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:132
@@ -157304,43 +157269,43 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:112
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:108
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:104
+; SI-NEXT:    v_writelane_b32 v43, s4, 28
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_readfirstlane_b32 s72, v52
-; SI-NEXT:    v_readfirstlane_b32 s73, v53
-; SI-NEXT:    v_readfirstlane_b32 s44, v55
+; SI-NEXT:    v_readfirstlane_b32 s4, v52
+; SI-NEXT:    v_writelane_b32 v43, s4, 29
+; SI-NEXT:    v_readfirstlane_b32 s4, v53
+; SI-NEXT:    v_writelane_b32 v43, s4, 30
+; SI-NEXT:    v_readfirstlane_b32 s4, v54
+; SI-NEXT:    v_writelane_b32 v43, s4, 31
+; SI-NEXT:    v_readfirstlane_b32 s4, v55
+; SI-NEXT:    v_writelane_b32 v43, s4, 32
 ; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v40
-; SI-NEXT:    v_writelane_b32 v44, s4, 35
-; SI-NEXT:    v_writelane_b32 v44, s22, 36
-; SI-NEXT:    v_writelane_b32 v44, s23, 37
-; SI-NEXT:    v_writelane_b32 v44, s6, 38
-; SI-NEXT:    v_writelane_b32 v44, s20, 39
-; SI-NEXT:    v_writelane_b32 v44, s25, 40
-; SI-NEXT:    v_writelane_b32 v44, s24, 41
-; SI-NEXT:    v_writelane_b32 v44, s44, 42
-; SI-NEXT:    v_writelane_b32 v44, s72, 43
-; SI-NEXT:    v_writelane_b32 v44, s13, 44
-; SI-NEXT:    v_writelane_b32 v44, s60, 45
-; SI-NEXT:    v_writelane_b32 v44, s73, 46
-; SI-NEXT:    v_readfirstlane_b32 s21, v54
-; SI-NEXT:    v_writelane_b32 v44, s42, 47
-; SI-NEXT:    v_writelane_b32 v44, s21, 48
-; SI-NEXT:    v_writelane_b32 v44, s16, 49
-; SI-NEXT:    v_writelane_b32 v44, s17, 50
-; SI-NEXT:    v_writelane_b32 v44, s18, 51
-; SI-NEXT:    v_writelane_b32 v44, s19, 52
-; SI-NEXT:    v_writelane_b32 v44, s78, 53
-; SI-NEXT:    v_writelane_b32 v44, s79, 54
-; SI-NEXT:    v_writelane_b32 v44, s88, 55
-; SI-NEXT:    v_writelane_b32 v44, s89, 56
-; SI-NEXT:    v_writelane_b32 v44, s90, 57
-; SI-NEXT:    v_writelane_b32 v44, s91, 58
-; SI-NEXT:    v_writelane_b32 v44, s92, 59
-; SI-NEXT:    v_writelane_b32 v44, s93, 60
-; SI-NEXT:    v_writelane_b32 v44, s94, 61
-; SI-NEXT:    v_writelane_b32 v44, s95, 62
+; SI-NEXT:    v_writelane_b32 v43, s4, 33
+; SI-NEXT:    v_writelane_b32 v43, s22, 34
+; SI-NEXT:    v_writelane_b32 v43, s23, 35
+; SI-NEXT:    v_writelane_b32 v43, s72, 36
+; SI-NEXT:    v_writelane_b32 v43, s20, 37
+; SI-NEXT:    v_writelane_b32 v43, s79, 38
+; SI-NEXT:    v_writelane_b32 v43, s76, 39
+; SI-NEXT:    v_writelane_b32 v43, s25, 40
+; SI-NEXT:    v_writelane_b32 v43, s60, 41
+; SI-NEXT:    v_writelane_b32 v43, s29, 42
+; SI-NEXT:    v_writelane_b32 v43, s77, 43
+; SI-NEXT:    v_writelane_b32 v43, s16, 44
+; SI-NEXT:    v_writelane_b32 v43, s17, 45
+; SI-NEXT:    v_writelane_b32 v43, s18, 46
+; SI-NEXT:    v_writelane_b32 v43, s19, 47
+; SI-NEXT:    v_writelane_b32 v43, s88, 48
+; SI-NEXT:    v_writelane_b32 v43, s89, 49
+; SI-NEXT:    v_writelane_b32 v43, s90, 50
+; SI-NEXT:    v_writelane_b32 v43, s91, 51
+; SI-NEXT:    v_writelane_b32 v43, s92, 52
+; SI-NEXT:    v_writelane_b32 v43, s93, 53
+; SI-NEXT:    v_writelane_b32 v43, s94, 54
+; SI-NEXT:    v_writelane_b32 v43, s95, 55
 ; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_readfirstlane_b32 s40, v33
+; SI-NEXT:    v_readfirstlane_b32 s62, v33
 ; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_readfirstlane_b32 s10, v34
 ; SI-NEXT:    s_waitcnt vmcnt(8)
@@ -157348,13 +157313,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    v_readfirstlane_b32 s28, v31
 ; SI-NEXT:    v_readfirstlane_b32 s27, v32
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_readfirstlane_b32 s29, v36
+; SI-NEXT:    v_readfirstlane_b32 s58, v36
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_readfirstlane_b32 s70, v37
+; SI-NEXT:    v_readfirstlane_b32 s69, v37
 ; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_readfirstlane_b32 s14, v38
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_readfirstlane_b32 s69, v39
+; SI-NEXT:    v_readfirstlane_b32 s68, v39
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:100
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:96
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:92
@@ -157369,50 +157334,42 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_readfirstlane_b32 s11, v49
 ; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_readfirstlane_b32 s71, v50
+; SI-NEXT:    v_readfirstlane_b32 s70, v50
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_readfirstlane_b32 s81, v51
+; SI-NEXT:    v_readfirstlane_b32 s71, v51
 ; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:64
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:56
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_readfirstlane_b32 vcc_hi, v18
-; SI-NEXT:    v_writelane_b32 v43, vcc_hi, 0
-; SI-NEXT:    v_writelane_b32 v43, s30, 1
-; SI-NEXT:    v_writelane_b32 v43, s31, 2
-; SI-NEXT:    v_writelane_b32 v43, s34, 3
-; SI-NEXT:    v_writelane_b32 v43, s35, 4
-; SI-NEXT:    v_writelane_b32 v43, s36, 5
-; SI-NEXT:    v_writelane_b32 v43, s37, 6
-; SI-NEXT:    v_writelane_b32 v43, s38, 7
-; SI-NEXT:    v_writelane_b32 v43, s39, 8
-; SI-NEXT:    v_writelane_b32 v43, s48, 9
-; SI-NEXT:    v_writelane_b32 v43, s49, 10
-; SI-NEXT:    v_writelane_b32 v43, s50, 11
-; SI-NEXT:    v_writelane_b32 v43, s51, 12
-; SI-NEXT:    v_writelane_b32 v43, s52, 13
-; SI-NEXT:    v_writelane_b32 v43, s53, 14
-; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v17
-; SI-NEXT:    v_writelane_b32 v44, vcc_lo, 63
+; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v12
+; SI-NEXT:    v_readfirstlane_b32 vcc_hi, v11
+; SI-NEXT:    v_writelane_b32 v43, vcc_lo, 56
+; SI-NEXT:    v_writelane_b32 v43, vcc_hi, 57
+; SI-NEXT:    v_writelane_b32 v43, s30, 58
+; SI-NEXT:    v_writelane_b32 v43, s31, 59
+; SI-NEXT:    v_writelane_b32 v43, s34, 60
+; SI-NEXT:    v_writelane_b32 v43, s35, 61
+; SI-NEXT:    v_writelane_b32 v43, s36, 62
+; SI-NEXT:    v_writelane_b32 v43, s37, 63
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_readfirstlane_b32 s46, v31
+; SI-NEXT:    v_readfirstlane_b32 s74, v31
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_readfirstlane_b32 s59, v32
+; SI-NEXT:    v_readfirstlane_b32 s46, v32
 ; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_readfirstlane_b32 s83, v33
+; SI-NEXT:    v_readfirstlane_b32 s96, v33
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_readfirstlane_b32 s12, v34
+; SI-NEXT:    v_readfirstlane_b32 s98, v34
 ; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_readfirstlane_b32 s97, v35
+; SI-NEXT:    v_readfirstlane_b32 s41, v35
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_readfirstlane_b32 s8, v36
+; SI-NEXT:    v_readfirstlane_b32 s56, v36
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_readfirstlane_b32 s84, v37
+; SI-NEXT:    v_readfirstlane_b32 s87, v37
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_readfirstlane_b32 s86, v38
+; SI-NEXT:    v_readfirstlane_b32 s99, v38
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_readfirstlane_b32 s15, v39
+; SI-NEXT:    v_readfirstlane_b32 s81, v39
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:40
@@ -157422,417 +157379,415 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:24
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:20
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_readfirstlane_b32 s62, v48
+; SI-NEXT:    v_readfirstlane_b32 s26, v48
 ; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_readfirstlane_b32 s96, v49
+; SI-NEXT:    v_readfirstlane_b32 s83, v49
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_readfirstlane_b32 s7, v50
+; SI-NEXT:    v_readfirstlane_b32 s82, v50
 ; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_readfirstlane_b32 s80, v51
+; SI-NEXT:    v_readfirstlane_b32 s7, v51
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_readfirstlane_b32 s41, v31
+; SI-NEXT:    v_readfirstlane_b32 s15, v31
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_readfirstlane_b32 s98, v32
+; SI-NEXT:    v_readfirstlane_b32 s97, v32
 ; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_readfirstlane_b32 s99, v33
+; SI-NEXT:    v_readfirstlane_b32 s44, v33
 ; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_readfirstlane_b32 s9, v34
 ; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_readfirstlane_b32 s82, v35
+; SI-NEXT:    v_readfirstlane_b32 s80, v35
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_readfirstlane_b32 s68, v36
+; SI-NEXT:    v_readfirstlane_b32 s86, v36
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_readfirstlane_b32 s67, v37
+; SI-NEXT:    v_readfirstlane_b32 s85, v37
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_readfirstlane_b32 s85, v38
+; SI-NEXT:    v_readfirstlane_b32 s8, v38
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_readfirstlane_b32 s87, v39
+; SI-NEXT:    v_readfirstlane_b32 s12, v39
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_readfirstlane_b32 s55, v48
+; SI-NEXT:    v_readfirstlane_b32 s65, v48
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_readfirstlane_b32 s54, v49
-; SI-NEXT:    v_writelane_b32 v43, s54, 15
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_readfirstlane_b32 s64, v50
-; SI-NEXT:    v_writelane_b32 v43, s55, 16
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s65, v51
-; SI-NEXT:    v_writelane_b32 v43, s64, 17
-; SI-NEXT:    v_writelane_b32 v43, s65, 18
-; SI-NEXT:    v_writelane_b32 v43, s67, 19
-; SI-NEXT:    v_writelane_b32 v43, s68, 20
-; SI-NEXT:    v_writelane_b32 v43, s84, 21
-; SI-NEXT:    v_writelane_b32 v43, s85, 22
-; SI-NEXT:    v_writelane_b32 v43, s86, 23
-; SI-NEXT:    v_writelane_b32 v43, s87, 24
-; SI-NEXT:    v_writelane_b32 v43, s99, 25
-; SI-NEXT:    v_writelane_b32 v43, s98, 26
-; SI-NEXT:    v_writelane_b32 v43, s96, 27
-; SI-NEXT:    v_writelane_b32 v43, s7, 28
-; SI-NEXT:    v_writelane_b32 v43, s12, 29
-; SI-NEXT:    v_writelane_b32 v43, s83, 30
-; SI-NEXT:    v_writelane_b32 v43, s15, 31
-; SI-NEXT:    v_writelane_b32 v43, s9, 32
-; SI-NEXT:    v_writelane_b32 v43, s97, 33
-; SI-NEXT:    v_writelane_b32 v43, s82, 34
-; SI-NEXT:    v_writelane_b32 v43, s80, 35
-; SI-NEXT:    v_writelane_b32 v43, s8, 36
-; SI-NEXT:    v_writelane_b32 v43, s62, 37
-; SI-NEXT:    v_writelane_b32 v43, s41, 38
-; SI-NEXT:    v_writelane_b32 v43, s14, 39
-; SI-NEXT:    v_writelane_b32 v43, s70, 40
-; SI-NEXT:    v_writelane_b32 v43, s81, 41
-; SI-NEXT:    v_writelane_b32 v43, s71, 42
-; SI-NEXT:    v_writelane_b32 v43, s69, 43
-; SI-NEXT:    v_writelane_b32 v43, s46, 44
-; SI-NEXT:    v_writelane_b32 v43, s59, 45
-; SI-NEXT:    v_writelane_b32 v43, s11, 46
-; SI-NEXT:    v_writelane_b32 v43, s10, 47
-; SI-NEXT:    v_writelane_b32 v43, s40, 48
-; SI-NEXT:    v_writelane_b32 v43, s66, 49
-; SI-NEXT:    v_writelane_b32 v43, s29, 50
-; SI-NEXT:    v_writelane_b32 v43, s28, 51
-; SI-NEXT:    v_writelane_b32 v43, s27, 52
-; SI-NEXT:    v_writelane_b32 v43, s45, 53
-; SI-NEXT:    v_writelane_b32 v43, s47, 54
-; SI-NEXT:    v_writelane_b32 v43, s61, 55
+; SI-NEXT:    v_readfirstlane_b32 s64, v49
+; SI-NEXT:    v_writelane_b32 v42, s64, 10
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_readfirstlane_b32 s67, v50
+; SI-NEXT:    v_writelane_b32 v42, s65, 11
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_readfirstlane_b32 s84, v51
+; SI-NEXT:    v_writelane_b32 v42, s67, 12
+; SI-NEXT:    v_writelane_b32 v42, s84, 13
+; SI-NEXT:    v_writelane_b32 v42, s85, 14
+; SI-NEXT:    v_writelane_b32 v42, s86, 15
+; SI-NEXT:    v_writelane_b32 v42, s87, 16
+; SI-NEXT:    v_writelane_b32 v42, s8, 17
+; SI-NEXT:    v_writelane_b32 v42, s99, 18
+; SI-NEXT:    v_writelane_b32 v42, s12, 19
+; SI-NEXT:    v_writelane_b32 v42, s44, 20
+; SI-NEXT:    v_writelane_b32 v42, s97, 21
+; SI-NEXT:    v_writelane_b32 v42, s83, 22
+; SI-NEXT:    v_writelane_b32 v42, s82, 23
+; SI-NEXT:    v_writelane_b32 v42, s98, 24
+; SI-NEXT:    v_writelane_b32 v42, s96, 25
+; SI-NEXT:    v_writelane_b32 v42, s81, 26
+; SI-NEXT:    v_writelane_b32 v42, s9, 27
+; SI-NEXT:    v_writelane_b32 v42, s41, 28
+; SI-NEXT:    v_writelane_b32 v42, s80, 29
+; SI-NEXT:    v_writelane_b32 v42, s7, 30
+; SI-NEXT:    v_writelane_b32 v42, s56, 31
+; SI-NEXT:    v_writelane_b32 v42, s26, 32
+; SI-NEXT:    v_writelane_b32 v42, s15, 33
+; SI-NEXT:    v_writelane_b32 v42, s14, 34
+; SI-NEXT:    v_writelane_b32 v42, s69, 35
+; SI-NEXT:    v_writelane_b32 v42, s71, 36
+; SI-NEXT:    v_writelane_b32 v42, s70, 37
+; SI-NEXT:    v_writelane_b32 v42, s68, 38
+; SI-NEXT:    v_writelane_b32 v42, s74, 39
+; SI-NEXT:    v_writelane_b32 v42, s46, 40
+; SI-NEXT:    v_writelane_b32 v42, s11, 41
+; SI-NEXT:    v_writelane_b32 v42, s10, 42
+; SI-NEXT:    v_writelane_b32 v42, s62, 43
+; SI-NEXT:    v_writelane_b32 v42, s66, 44
+; SI-NEXT:    v_writelane_b32 v42, s58, 45
+; SI-NEXT:    v_writelane_b32 v42, s28, 46
+; SI-NEXT:    v_writelane_b32 v42, s27, 47
+; SI-NEXT:    v_writelane_b32 v42, s78, 48
+; SI-NEXT:    v_writelane_b32 v42, s24, 49
 ; SI-NEXT:    s_cbranch_scc0 .LBB89_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_readlane_b32 s4, v44, 7
+; SI-NEXT:    v_readlane_b32 s4, v43, 3
 ; SI-NEXT:    s_and_b32 s4, s4, 0xff
-; SI-NEXT:    v_readlane_b32 s5, v44, 6
+; SI-NEXT:    v_readlane_b32 s5, v43, 2
 ; SI-NEXT:    s_lshl_b32 s4, s4, 16
 ; SI-NEXT:    s_lshl_b32 s5, s5, 24
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
-; SI-NEXT:    v_readlane_b32 s5, v44, 4
-; SI-NEXT:    v_writelane_b32 v42, s4, 0
-; SI-NEXT:    v_readlane_b32 s4, v44, 5
+; SI-NEXT:    v_writelane_b32 v42, s4, 56
+; SI-NEXT:    v_readlane_b32 s4, v43, 1
 ; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    v_readlane_b32 s5, v43, 0
 ; SI-NEXT:    s_lshl_b32 s4, s4, 16
 ; SI-NEXT:    s_lshl_b32 s5, s5, 24
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_writelane_b32 v42, s4, 1
+; SI-NEXT:    v_writelane_b32 v42, s4, 57
 ; SI-NEXT:    s_and_b32 s4, s20, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s6, 8
-; SI-NEXT:    s_or_b32 s43, s4, s5
+; SI-NEXT:    s_lshl_b32 s5, s72, 8
+; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    s_and_b32 s5, s22, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_mov_b32 s22, s6
 ; SI-NEXT:    s_lshl_b32 s6, s23, 24
+; SI-NEXT:    v_writelane_b32 v42, s4, 58
 ; SI-NEXT:    s_or_b32 s4, s6, s5
-; SI-NEXT:    s_and_b32 s5, s24, 0xff
+; SI-NEXT:    s_and_b32 s5, s60, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
 ; SI-NEXT:    s_lshl_b32 s6, s25, 24
-; SI-NEXT:    v_writelane_b32 v42, s4, 2
-; SI-NEXT:    s_or_b32 s4, s6, s5
-; SI-NEXT:    v_readlane_b32 s5, v44, 3
-; SI-NEXT:    s_and_b32 s5, s5, 0xff
-; SI-NEXT:    v_readlane_b32 s6, v44, 2
+; SI-NEXT:    v_writelane_b32 v42, s4, 59
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_writelane_b32 v42, s5, 60
+; SI-NEXT:    s_and_b32 s5, s79, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s6, s6, 24
-; SI-NEXT:    s_or_b32 s25, s6, s5
-; SI-NEXT:    v_readlane_b32 s5, v44, 1
-; SI-NEXT:    v_readlane_b32 s6, v44, 0
-; SI-NEXT:    s_and_b32 s5, s5, 0xff
-; SI-NEXT:    s_lshl_b32 s6, s6, 8
-; SI-NEXT:    s_or_b32 s24, s5, s6
+; SI-NEXT:    s_lshl_b32 s6, s76, 24
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_writelane_b32 v42, s5, 61
+; SI-NEXT:    s_and_b32 s5, s77, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s29, 8
+; SI-NEXT:    s_or_b32 s5, s5, s6
 ; SI-NEXT:    s_and_b32 s6, s16, 0xff
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
 ; SI-NEXT:    s_lshl_b32 s16, s17, 24
-; SI-NEXT:    v_writelane_b32 v42, s4, 3
-; SI-NEXT:    s_or_b32 s4, s16, s6
-; SI-NEXT:    s_and_b32 s6, s79, 0xff
+; SI-NEXT:    s_or_b32 s6, s16, s6
+; SI-NEXT:    v_writelane_b32 v42, s6, 62
+; SI-NEXT:    s_and_b32 s6, s89, 0xff
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    s_lshl_b32 s16, s78, 24
-; SI-NEXT:    s_or_b32 s5, s16, s6
+; SI-NEXT:    s_lshl_b32 s16, s88, 24
+; SI-NEXT:    s_mov_b32 s4, s47
+; SI-NEXT:    s_or_b32 s47, s16, s6
 ; SI-NEXT:    s_and_b32 s6, s18, 0xff
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
 ; SI-NEXT:    s_lshl_b32 s16, s19, 24
-; SI-NEXT:    s_or_b32 s75, s16, s6
-; SI-NEXT:    s_and_b32 s6, s91, 0xff
-; SI-NEXT:    s_lshl_b32 s16, s90, 8
+; SI-NEXT:    s_or_b32 s25, s16, s6
+; SI-NEXT:    s_and_b32 s6, s93, 0xff
+; SI-NEXT:    s_lshl_b32 s16, s92, 8
 ; SI-NEXT:    s_or_b32 s6, s6, s16
-; SI-NEXT:    s_and_b32 s16, s88, 0xff
+; SI-NEXT:    s_and_b32 s16, s90, 0xff
 ; SI-NEXT:    s_lshl_b32 s16, s16, 16
-; SI-NEXT:    s_lshl_b32 s17, s89, 24
-; SI-NEXT:    s_or_b32 s78, s17, s16
-; SI-NEXT:    s_and_b32 s16, s95, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s91, 24
+; SI-NEXT:    s_or_b32 s92, s17, s16
+; SI-NEXT:    s_and_b32 s16, vcc_hi, 0xff
 ; SI-NEXT:    s_lshl_b32 s16, s16, 16
-; SI-NEXT:    s_lshl_b32 s17, s94, 24
-; SI-NEXT:    s_mov_b32 s23, s21
-; SI-NEXT:    s_or_b32 s21, s17, s16
-; SI-NEXT:    s_and_b32 s16, s92, 0xff
+; SI-NEXT:    s_lshl_b32 s17, vcc_lo, 24
+; SI-NEXT:    s_or_b32 s76, s17, s16
+; SI-NEXT:    s_and_b32 s16, s94, 0xff
 ; SI-NEXT:    s_lshl_b32 s16, s16, 16
-; SI-NEXT:    s_lshl_b32 s17, s93, 24
-; SI-NEXT:    s_or_b32 s79, s17, s16
-; SI-NEXT:    s_and_b32 s16, s31, 0xff
-; SI-NEXT:    s_lshl_b32 s17, s30, 8
+; SI-NEXT:    s_lshl_b32 s17, s95, 24
+; SI-NEXT:    s_or_b32 s91, s17, s16
+; SI-NEXT:    s_and_b32 s16, s35, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s34, 8
 ; SI-NEXT:    s_or_b32 s16, s16, s17
-; SI-NEXT:    s_and_b32 s17, vcc_lo, 0xff
+; SI-NEXT:    s_and_b32 s17, s30, 0xff
 ; SI-NEXT:    s_lshl_b32 s17, s17, 16
-; SI-NEXT:    s_lshl_b32 s18, vcc_hi, 24
-; SI-NEXT:    s_or_b32 s17, s18, s17
-; SI-NEXT:    v_writelane_b32 v43, s17, 56
-; SI-NEXT:    s_and_b32 s17, s37, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s31, 24
+; SI-NEXT:    s_or_b32 s77, s18, s17
+; SI-NEXT:    s_and_b32 s17, s39, 0xff
 ; SI-NEXT:    s_lshl_b32 s17, s17, 16
-; SI-NEXT:    s_lshl_b32 s18, s36, 24
-; SI-NEXT:    s_or_b32 s17, s18, s17
-; SI-NEXT:    v_writelane_b32 v43, s17, 58
-; SI-NEXT:    s_and_b32 s17, s34, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s38, 24
+; SI-NEXT:    s_or_b32 s79, s18, s17
+; SI-NEXT:    s_and_b32 s17, s36, 0xff
 ; SI-NEXT:    s_lshl_b32 s17, s17, 16
-; SI-NEXT:    s_lshl_b32 s18, s35, 24
-; SI-NEXT:    s_or_b32 s17, s18, s17
-; SI-NEXT:    v_writelane_b32 v43, s17, 57
-; SI-NEXT:    s_and_b32 s17, s49, 0xff
-; SI-NEXT:    s_lshl_b32 s18, s48, 8
+; SI-NEXT:    s_lshl_b32 s18, s37, 24
+; SI-NEXT:    s_or_b32 s93, s18, s17
+; SI-NEXT:    s_and_b32 s17, s51, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s50, 8
 ; SI-NEXT:    s_or_b32 s17, s17, s18
-; SI-NEXT:    s_and_b32 s18, s38, 0xff
+; SI-NEXT:    s_and_b32 s18, s48, 0xff
 ; SI-NEXT:    s_lshl_b32 s18, s18, 16
-; SI-NEXT:    s_lshl_b32 s19, s39, 24
-; SI-NEXT:    s_or_b32 s18, s19, s18
-; SI-NEXT:    v_writelane_b32 v43, s18, 59
-; SI-NEXT:    s_and_b32 s18, s53, 0xff
+; SI-NEXT:    s_lshl_b32 s19, s49, 24
+; SI-NEXT:    s_or_b32 s89, s19, s18
+; SI-NEXT:    s_and_b32 s18, s55, 0xff
 ; SI-NEXT:    s_lshl_b32 s18, s18, 16
-; SI-NEXT:    s_lshl_b32 s19, s52, 24
-; SI-NEXT:    s_or_b32 s18, s19, s18
-; SI-NEXT:    v_writelane_b32 v43, s18, 61
-; SI-NEXT:    s_and_b32 s18, s50, 0xff
+; SI-NEXT:    s_lshl_b32 s19, s54, 24
+; SI-NEXT:    s_or_b32 s31, s19, s18
+; SI-NEXT:    s_and_b32 s18, s52, 0xff
 ; SI-NEXT:    s_lshl_b32 s18, s18, 16
-; SI-NEXT:    s_lshl_b32 s19, s51, 24
-; SI-NEXT:    s_or_b32 s18, s19, s18
-; SI-NEXT:    v_writelane_b32 v43, s18, 60
-; SI-NEXT:    s_and_b32 s18, s65, 0xff
-; SI-NEXT:    s_lshl_b32 s19, s64, 8
+; SI-NEXT:    s_lshl_b32 s19, s53, 24
+; SI-NEXT:    s_or_b32 s94, s19, s18
+; SI-NEXT:    s_and_b32 s18, s84, 0xff
+; SI-NEXT:    s_lshl_b32 s19, s67, 8
 ; SI-NEXT:    s_or_b32 s18, s18, s19
-; SI-NEXT:    s_and_b32 s19, s54, 0xff
+; SI-NEXT:    s_and_b32 s19, s64, 0xff
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_lshl_b32 s20, s55, 24
-; SI-NEXT:    s_or_b32 s19, s20, s19
-; SI-NEXT:    v_writelane_b32 v43, s19, 62
-; SI-NEXT:    s_and_b32 s19, s87, 0xff
+; SI-NEXT:    s_lshl_b32 s20, s65, 24
+; SI-NEXT:    s_or_b32 s60, s20, s19
+; SI-NEXT:    s_and_b32 s19, s12, 0xff
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_lshl_b32 s20, s85, 24
-; SI-NEXT:    s_or_b32 s19, s20, s19
-; SI-NEXT:    v_writelane_b32 v43, s19, 63
-; SI-NEXT:    s_and_b32 s19, s67, 0xff
+; SI-NEXT:    s_lshl_b32 s20, s8, 24
+; SI-NEXT:    s_or_b32 s8, s20, s19
+; SI-NEXT:    s_and_b32 s19, s85, 0xff
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_lshl_b32 s20, s68, 24
-; SI-NEXT:    s_or_b32 s95, s20, s19
-; SI-NEXT:    s_and_b32 s19, s82, 0xff
+; SI-NEXT:    s_lshl_b32 s20, s86, 24
+; SI-NEXT:    s_or_b32 s12, s20, s19
+; SI-NEXT:    s_and_b32 s19, s80, 0xff
 ; SI-NEXT:    s_lshl_b32 s20, s9, 8
 ; SI-NEXT:    s_or_b32 vcc_lo, s19, s20
-; SI-NEXT:    s_and_b32 s19, s99, 0xff
+; SI-NEXT:    s_and_b32 s19, s44, 0xff
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_lshl_b32 s20, s98, 24
-; SI-NEXT:    s_or_b32 s30, s20, s19
-; SI-NEXT:    s_and_b32 s19, s41, 0xff
+; SI-NEXT:    s_lshl_b32 s20, s97, 24
+; SI-NEXT:    s_or_b32 s9, s20, s19
+; SI-NEXT:    s_and_b32 s19, s15, 0xff
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_lshl_b32 s20, s80, 24
-; SI-NEXT:    s_or_b32 s31, s20, s19
-; SI-NEXT:    s_and_b32 s19, s7, 0xff
+; SI-NEXT:    s_lshl_b32 s20, s7, 24
+; SI-NEXT:    s_or_b32 s7, s20, s19
+; SI-NEXT:    s_and_b32 s19, s82, 0xff
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_lshl_b32 s20, s96, 24
-; SI-NEXT:    s_or_b32 s34, s20, s19
-; SI-NEXT:    s_and_b32 s19, s62, 0xff
-; SI-NEXT:    s_lshl_b32 s20, s15, 8
+; SI-NEXT:    s_lshl_b32 s20, s83, 24
+; SI-NEXT:    s_or_b32 s23, s20, s19
+; SI-NEXT:    s_and_b32 s19, s26, 0xff
+; SI-NEXT:    s_lshl_b32 s20, s81, 8
 ; SI-NEXT:    s_or_b32 vcc_hi, s19, s20
-; SI-NEXT:    s_and_b32 s19, s86, 0xff
+; SI-NEXT:    s_and_b32 s19, s99, 0xff
+; SI-NEXT:    v_writelane_b32 v42, s9, 50
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_lshl_b32 s20, s84, 24
-; SI-NEXT:    s_or_b32 s35, s20, s19
-; SI-NEXT:    s_and_b32 s19, s8, 0xff
+; SI-NEXT:    s_lshl_b32 s20, s87, 24
+; SI-NEXT:    v_writelane_b32 v42, s7, 51
+; SI-NEXT:    s_or_b32 s7, s20, s19
+; SI-NEXT:    s_and_b32 s19, s56, 0xff
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_lshl_b32 s20, s97, 24
-; SI-NEXT:    s_or_b32 s36, s20, s19
-; SI-NEXT:    s_and_b32 s19, s12, 0xff
+; SI-NEXT:    s_lshl_b32 s20, s41, 24
+; SI-NEXT:    v_writelane_b32 v42, s7, 52
+; SI-NEXT:    s_or_b32 s7, s20, s19
+; SI-NEXT:    s_and_b32 s19, s98, 0xff
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_lshl_b32 s20, s83, 24
-; SI-NEXT:    s_or_b32 s37, s20, s19
-; SI-NEXT:    s_and_b32 s19, s59, 0xff
-; SI-NEXT:    s_lshl_b32 s20, s46, 8
+; SI-NEXT:    s_lshl_b32 s20, s96, 24
+; SI-NEXT:    v_writelane_b32 v42, s7, 54
+; SI-NEXT:    s_or_b32 s7, s20, s19
+; SI-NEXT:    s_and_b32 s19, s46, 0xff
+; SI-NEXT:    s_lshl_b32 s20, s74, 8
 ; SI-NEXT:    s_or_b32 s84, s19, s20
-; SI-NEXT:    s_and_b32 s19, s81, 0xff
+; SI-NEXT:    s_and_b32 s19, s71, 0xff
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_lshl_b32 s20, s71, 24
-; SI-NEXT:    s_or_b32 s38, s20, s19
+; SI-NEXT:    s_lshl_b32 s20, s70, 24
+; SI-NEXT:    s_or_b32 s72, s20, s19
 ; SI-NEXT:    s_and_b32 s19, s11, 0xff
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_lshl_b32 s20, s69, 24
-; SI-NEXT:    s_or_b32 s39, s20, s19
+; SI-NEXT:    s_lshl_b32 s20, s68, 24
+; SI-NEXT:    v_writelane_b32 v42, s7, 53
+; SI-NEXT:    s_or_b32 s7, s20, s19
 ; SI-NEXT:    s_and_b32 s19, s14, 0xff
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_lshl_b32 s20, s70, 24
-; SI-NEXT:    s_or_b32 s48, s20, s19
-; SI-NEXT:    s_and_b32 s19, s29, 0xff
+; SI-NEXT:    s_lshl_b32 s20, s69, 24
+; SI-NEXT:    s_or_b32 s9, s20, s19
+; SI-NEXT:    s_and_b32 s19, s58, 0xff
 ; SI-NEXT:    s_lshl_b32 s20, s66, 8
 ; SI-NEXT:    s_or_b32 s85, s19, s20
 ; SI-NEXT:    s_and_b32 s19, s10, 0xff
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_lshl_b32 s20, s40, 24
+; SI-NEXT:    s_lshl_b32 s20, s62, 24
 ; SI-NEXT:    s_or_b32 s49, s20, s19
 ; SI-NEXT:    s_and_b32 s19, s27, 0xff
+; SI-NEXT:    v_writelane_b32 v42, s9, 55
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
 ; SI-NEXT:    s_lshl_b32 s20, s28, 24
-; SI-NEXT:    v_readlane_b32 s7, v44, 35
+; SI-NEXT:    v_readlane_b32 s9, v43, 33
 ; SI-NEXT:    s_or_b32 s50, s20, s19
-; SI-NEXT:    s_and_b32 s19, s7, 0xff
+; SI-NEXT:    s_and_b32 s19, s9, 0xff
+; SI-NEXT:    v_readlane_b32 s9, v43, 32
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_lshl_b32 s20, s44, 24
+; SI-NEXT:    s_lshl_b32 s20, s9, 24
+; SI-NEXT:    v_readlane_b32 s9, v43, 31
 ; SI-NEXT:    s_or_b32 s51, s20, s19
-; SI-NEXT:    s_and_b32 s19, s23, 0xff
-; SI-NEXT:    s_lshl_b32 s20, s73, 8
+; SI-NEXT:    s_and_b32 s19, s9, 0xff
+; SI-NEXT:    v_readlane_b32 s9, v43, 30
+; SI-NEXT:    s_lshl_b32 s20, s9, 8
+; SI-NEXT:    v_readlane_b32 s9, v43, 29
 ; SI-NEXT:    s_or_b32 s86, s19, s20
-; SI-NEXT:    s_and_b32 s19, s72, 0xff
+; SI-NEXT:    s_and_b32 s19, s9, 0xff
+; SI-NEXT:    v_readlane_b32 s9, v43, 28
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_lshl_b32 s20, s13, 24
+; SI-NEXT:    s_lshl_b32 s20, s9, 24
+; SI-NEXT:    v_readlane_b32 s9, v43, 27
 ; SI-NEXT:    s_or_b32 s52, s20, s19
-; SI-NEXT:    s_and_b32 s19, s42, 0xff
+; SI-NEXT:    s_and_b32 s19, s9, 0xff
+; SI-NEXT:    v_readlane_b32 s9, v43, 26
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_lshl_b32 s20, s60, 24
+; SI-NEXT:    s_lshl_b32 s20, s9, 24
+; SI-NEXT:    v_readlane_b32 s9, v43, 25
 ; SI-NEXT:    s_or_b32 s53, s20, s19
-; SI-NEXT:    s_and_b32 s19, s45, 0xff
+; SI-NEXT:    s_and_b32 s19, s9, 0xff
+; SI-NEXT:    v_readlane_b32 s9, v43, 24
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_lshl_b32 s20, s47, 24
+; SI-NEXT:    s_lshl_b32 s20, s9, 24
+; SI-NEXT:    v_readlane_b32 s9, v43, 23
 ; SI-NEXT:    s_or_b32 s54, s20, s19
-; SI-NEXT:    s_and_b32 s19, s76, 0xff
-; SI-NEXT:    s_lshl_b32 s20, s74, 8
+; SI-NEXT:    s_and_b32 s19, s9, 0xff
+; SI-NEXT:    v_readlane_b32 s9, v43, 22
+; SI-NEXT:    s_lshl_b32 s20, s9, 8
+; SI-NEXT:    v_readlane_b32 s9, v43, 21
 ; SI-NEXT:    s_or_b32 s87, s19, s20
-; SI-NEXT:    s_and_b32 s19, s61, 0xff
+; SI-NEXT:    s_and_b32 s19, s9, 0xff
+; SI-NEXT:    v_readlane_b32 s9, v43, 20
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_lshl_b32 s20, s56, 24
+; SI-NEXT:    s_lshl_b32 s20, s9, 24
+; SI-NEXT:    v_readlane_b32 s9, v43, 19
 ; SI-NEXT:    s_or_b32 s55, s20, s19
-; SI-NEXT:    s_and_b32 s19, s57, 0xff
+; SI-NEXT:    s_mov_b32 s58, s9
+; SI-NEXT:    s_and_b32 s19, s9, 0xff
+; SI-NEXT:    v_readlane_b32 s9, v43, 18
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_lshl_b32 s20, s63, 24
+; SI-NEXT:    s_lshl_b32 s20, s9, 24
 ; SI-NEXT:    s_or_b32 s64, s20, s19
-; SI-NEXT:    s_and_b32 s19, s77, 0xff
+; SI-NEXT:    s_and_b32 s19, s78, 0xff
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_lshl_b32 s20, s58, 24
-; SI-NEXT:    v_readlane_b32 s7, v44, 34
+; SI-NEXT:    s_lshl_b32 s20, s24, 24
 ; SI-NEXT:    s_or_b32 s65, s20, s19
-; SI-NEXT:    s_and_b32 s19, s26, 0xff
-; SI-NEXT:    s_mov_b32 s42, s7
-; SI-NEXT:    s_lshl_b32 s20, s7, 8
-; SI-NEXT:    v_readlane_b32 s7, v44, 33
-; SI-NEXT:    s_mov_b32 s94, s26
+; SI-NEXT:    s_and_b32 s19, s4, 0xff
+; SI-NEXT:    s_lshl_b32 s20, s45, 8
 ; SI-NEXT:    s_or_b32 s26, s19, s20
-; SI-NEXT:    s_mov_b32 s47, s7
-; SI-NEXT:    s_and_b32 s19, s7, 0xff
-; SI-NEXT:    v_readlane_b32 s7, v44, 32
-; SI-NEXT:    s_mov_b32 s92, s56
+; SI-NEXT:    s_and_b32 s19, s13, 0xff
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_mov_b32 s56, s7
-; SI-NEXT:    s_lshl_b32 s20, s7, 24
-; SI-NEXT:    v_readlane_b32 s7, v44, 31
+; SI-NEXT:    s_lshl_b32 s20, s57, 24
 ; SI-NEXT:    s_or_b32 s66, s20, s19
-; SI-NEXT:    s_mov_b32 s61, s7
-; SI-NEXT:    s_and_b32 s19, s7, 0xff
-; SI-NEXT:    v_readlane_b32 s7, v44, 30
+; SI-NEXT:    s_and_b32 s19, s21, 0xff
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_mov_b32 s60, s7
-; SI-NEXT:    s_lshl_b32 s20, s7, 24
-; SI-NEXT:    v_readlane_b32 s7, v44, 29
+; SI-NEXT:    s_lshl_b32 s20, s73, 24
 ; SI-NEXT:    s_or_b32 s67, s20, s19
-; SI-NEXT:    s_mov_b32 s68, s7
-; SI-NEXT:    s_and_b32 s19, s7, 0xff
-; SI-NEXT:    v_readlane_b32 s7, v44, 28
+; SI-NEXT:    s_and_b32 s19, s42, 0xff
+; SI-NEXT:    v_readlane_b32 s88, v43, 17
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_mov_b32 s59, s7
-; SI-NEXT:    s_lshl_b32 s20, s7, 24
-; SI-NEXT:    v_readlane_b32 s7, v44, 27
-; SI-NEXT:    s_or_b32 s45, s20, s19
-; SI-NEXT:    s_mov_b32 s46, s7
-; SI-NEXT:    s_and_b32 s19, s7, 0xff
-; SI-NEXT:    v_readlane_b32 s7, v44, 26
-; SI-NEXT:    s_mov_b32 s69, s7
-; SI-NEXT:    s_lshl_b32 s20, s7, 8
-; SI-NEXT:    v_readlane_b32 s7, v44, 25
+; SI-NEXT:    s_lshl_b32 s20, s59, 24
+; SI-NEXT:    s_or_b32 s68, s20, s19
+; SI-NEXT:    s_and_b32 s19, s63, 0xff
+; SI-NEXT:    s_lshl_b32 s20, s88, 8
 ; SI-NEXT:    s_or_b32 s27, s19, s20
-; SI-NEXT:    s_mov_b32 s40, s7
-; SI-NEXT:    s_and_b32 s19, s7, 0xff
-; SI-NEXT:    v_readlane_b32 s7, v44, 24
-; SI-NEXT:    s_mov_b32 s89, s76
-; SI-NEXT:    s_mov_b32 s76, s58
+; SI-NEXT:    s_and_b32 s19, s40, 0xff
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_mov_b32 s58, s7
-; SI-NEXT:    s_lshl_b32 s20, s7, 24
-; SI-NEXT:    v_readlane_b32 s7, v44, 23
-; SI-NEXT:    s_mov_b32 s93, s74
-; SI-NEXT:    s_mov_b32 s88, s57
-; SI-NEXT:    s_or_b32 s57, s20, s19
-; SI-NEXT:    s_mov_b32 s74, s7
-; SI-NEXT:    s_and_b32 s19, s7, 0xff
-; SI-NEXT:    v_readlane_b32 s7, v44, 22
-; SI-NEXT:    s_mov_b32 s90, s63
+; SI-NEXT:    s_lshl_b32 s20, s43, 24
+; SI-NEXT:    s_or_b32 s69, s20, s19
+; SI-NEXT:    s_and_b32 s19, s61, 0xff
+; SI-NEXT:    s_mov_b32 s39, s57
+; SI-NEXT:    s_mov_b32 s57, s7
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_mov_b32 s63, s7
-; SI-NEXT:    s_lshl_b32 s20, s7, 24
-; SI-NEXT:    v_readlane_b32 s7, v44, 21
+; SI-NEXT:    s_lshl_b32 s20, s75, 24
+; SI-NEXT:    v_readlane_b32 s7, v43, 16
 ; SI-NEXT:    s_or_b32 s70, s20, s19
-; SI-NEXT:    s_mov_b32 s71, s7
+; SI-NEXT:    s_mov_b32 s10, s7
 ; SI-NEXT:    s_and_b32 s19, s7, 0xff
-; SI-NEXT:    v_readlane_b32 s7, v44, 20
+; SI-NEXT:    v_readlane_b32 s7, v43, 15
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_mov_b32 s81, s7
+; SI-NEXT:    s_mov_b32 s71, s7
 ; SI-NEXT:    s_lshl_b32 s20, s7, 24
-; SI-NEXT:    v_readlane_b32 s7, v44, 19
-; SI-NEXT:    s_or_b32 s13, s20, s19
-; SI-NEXT:    s_mov_b32 s41, s7
+; SI-NEXT:    v_readlane_b32 s7, v43, 14
+; SI-NEXT:    s_or_b32 s62, s20, s19
+; SI-NEXT:    s_mov_b32 s15, s7
 ; SI-NEXT:    s_and_b32 s19, s7, 0xff
-; SI-NEXT:    v_readlane_b32 s7, v44, 18
-; SI-NEXT:    s_mov_b32 s14, s7
+; SI-NEXT:    v_readlane_b32 s7, v43, 13
+; SI-NEXT:    s_mov_b32 s41, s7
 ; SI-NEXT:    s_lshl_b32 s20, s7, 8
-; SI-NEXT:    v_readlane_b32 s7, v44, 17
+; SI-NEXT:    v_readlane_b32 s7, v43, 12
 ; SI-NEXT:    s_or_b32 s29, s19, s20
-; SI-NEXT:    s_mov_b32 s10, s7
+; SI-NEXT:    s_mov_b32 s14, s7
 ; SI-NEXT:    s_and_b32 s19, s7, 0xff
-; SI-NEXT:    v_readlane_b32 s7, v44, 16
+; SI-NEXT:    v_readlane_b32 s7, v43, 11
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
 ; SI-NEXT:    s_mov_b32 s9, s7
 ; SI-NEXT:    s_lshl_b32 s20, s7, 24
-; SI-NEXT:    v_readlane_b32 s7, v44, 15
+; SI-NEXT:    v_readlane_b32 s7, v43, 10
 ; SI-NEXT:    s_or_b32 s80, s20, s19
-; SI-NEXT:    s_mov_b32 s8, s7
+; SI-NEXT:    s_mov_b32 s56, s7
 ; SI-NEXT:    s_and_b32 s19, s7, 0xff
-; SI-NEXT:    v_readlane_b32 s7, v44, 14
+; SI-NEXT:    v_readlane_b32 s7, v43, 9
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_mov_b32 s15, s7
+; SI-NEXT:    s_mov_b32 s81, s7
 ; SI-NEXT:    s_lshl_b32 s20, s7, 24
-; SI-NEXT:    v_readlane_b32 s7, v44, 13
-; SI-NEXT:    s_mov_b32 s72, s25
-; SI-NEXT:    s_or_b32 s25, s20, s19
-; SI-NEXT:    s_mov_b32 s83, s7
+; SI-NEXT:    v_readlane_b32 s7, v43, 8
+; SI-NEXT:    s_or_b32 s11, s20, s19
+; SI-NEXT:    s_mov_b32 s82, s7
 ; SI-NEXT:    s_and_b32 s19, s7, 0xff
-; SI-NEXT:    v_readlane_b32 s7, v44, 12
+; SI-NEXT:    v_readlane_b32 s7, v43, 7
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_mov_b32 s97, s7
+; SI-NEXT:    s_mov_b32 s96, s7
 ; SI-NEXT:    s_lshl_b32 s20, s7, 24
-; SI-NEXT:    v_readlane_b32 s7, v44, 11
-; SI-NEXT:    s_or_b32 s82, s20, s19
-; SI-NEXT:    s_mov_b32 s44, s7
+; SI-NEXT:    v_readlane_b32 s7, v43, 6
+; SI-NEXT:    s_mov_b32 s36, s63
+; SI-NEXT:    s_mov_b32 s63, s93
+; SI-NEXT:    s_mov_b32 s93, s61
+; SI-NEXT:    s_mov_b32 s61, s91
+; SI-NEXT:    s_mov_b32 s91, s75
+; SI-NEXT:    s_mov_b32 s75, s92
+; SI-NEXT:    s_or_b32 s92, s20, s19
+; SI-NEXT:    s_mov_b32 s98, s7
 ; SI-NEXT:    s_and_b32 s19, s7, 0xff
-; SI-NEXT:    v_readlane_b32 s7, v44, 10
+; SI-NEXT:    v_readlane_b32 s7, v43, 5
+; SI-NEXT:    s_mov_b32 s44, s7
 ; SI-NEXT:    s_lshl_b32 s20, s7, 8
-; SI-NEXT:    v_readlane_b32 s12, v44, 9
-; SI-NEXT:    v_readlane_b32 s22, v44, 8
-; SI-NEXT:    s_mov_b32 s91, s77
-; SI-NEXT:    s_mov_b32 s77, s21
+; SI-NEXT:    v_readlane_b32 s7, v43, 4
+; SI-NEXT:    s_mov_b32 s48, s13
+; SI-NEXT:    s_mov_b32 s13, s94
+; SI-NEXT:    s_mov_b32 s94, s21
 ; SI-NEXT:    s_or_b32 s21, s19, s20
-; SI-NEXT:    s_and_b32 s19, s12, 0xff
+; SI-NEXT:    s_and_b32 s19, s7, 0xff
+; SI-NEXT:    s_mov_b32 s95, s4
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
 ; SI-NEXT:    s_lshl_b32 s20, s22, 24
-; SI-NEXT:    s_mov_b32 s73, s4
-; SI-NEXT:    s_mov_b32 s11, s7
-; SI-NEXT:    s_mov_b32 s7, s12
-; SI-NEXT:    s_mov_b32 s62, s22
-; SI-NEXT:    s_or_b32 s28, s20, s19
-; SI-NEXT:    s_lshl_b32 s20, s43, 16
-; SI-NEXT:    s_lshl_b32 s23, s24, 16
+; SI-NEXT:    v_readlane_b32 s4, v42, 58
+; SI-NEXT:    s_mov_b32 s46, s45
+; SI-NEXT:    s_mov_b32 s34, s73
+; SI-NEXT:    s_mov_b32 s73, s12
+; SI-NEXT:    s_mov_b32 s37, s42
+; SI-NEXT:    s_mov_b32 s38, s59
+; SI-NEXT:    s_mov_b32 s59, s8
+; SI-NEXT:    s_mov_b32 s30, s88
+; SI-NEXT:    s_mov_b32 s88, s31
+; SI-NEXT:    s_mov_b32 s78, s40
+; SI-NEXT:    s_mov_b32 s31, s43
+; SI-NEXT:    s_mov_b32 s12, s7
+; SI-NEXT:    s_mov_b32 s7, s22
+; SI-NEXT:    s_or_b32 s83, s20, s19
+; SI-NEXT:    s_lshl_b32 s20, s4, 16
+; SI-NEXT:    s_lshl_b32 s74, s5, 16
 ; SI-NEXT:    s_lshl_b32 s22, s6, 16
 ; SI-NEXT:    s_lshl_b32 s16, s16, 16
 ; SI-NEXT:    s_lshl_b32 s19, s17, 16
@@ -157840,48 +157795,50 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    s_lshl_b32 s17, vcc_lo, 16
 ; SI-NEXT:    s_lshl_b32 s6, vcc_hi, 16
 ; SI-NEXT:    s_lshl_b32 s99, s84, 16
-; SI-NEXT:    s_lshl_b32 s98, s85, 16
-; SI-NEXT:    s_lshl_b32 s12, s86, 16
-; SI-NEXT:    s_lshl_b32 s96, s87, 16
+; SI-NEXT:    s_lshl_b32 s8, s85, 16
+; SI-NEXT:    s_lshl_b32 s97, s86, 16
+; SI-NEXT:    s_lshl_b32 s28, s87, 16
 ; SI-NEXT:    s_lshl_b32 s87, s26, 16
-; SI-NEXT:    v_readlane_b32 s26, v42, 0
+; SI-NEXT:    v_readlane_b32 s26, v42, 56
 ; SI-NEXT:    s_lshl_b32 s86, s27, 16
-; SI-NEXT:    v_readlane_b32 s27, v42, 1
+; SI-NEXT:    v_readlane_b32 s27, v42, 57
+; SI-NEXT:    v_readlane_b32 s35, v42, 61
 ; SI-NEXT:    s_lshl_b32 s85, s29, 16
-; SI-NEXT:    v_readlane_b32 s29, v42, 3
-; SI-NEXT:    v_readlane_b32 s24, v42, 2
+; SI-NEXT:    v_readlane_b32 s29, v42, 60
+; SI-NEXT:    v_readlane_b32 s24, v42, 59
+; SI-NEXT:    v_readlane_b32 s90, v42, 62
 ; SI-NEXT:    s_lshl_b32 s84, s21, 16
-; SI-NEXT:    s_mov_b32 s21, s5
+; SI-NEXT:    s_mov_b32 s21, s47
 ; SI-NEXT:    s_cbranch_execnz .LBB89_3
 ; SI-NEXT:  .LBB89_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s4, s44, 3
+; SI-NEXT:    s_add_i32 s4, s98, 3
 ; SI-NEXT:    s_and_b32 s4, s4, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s11, 8
-; SI-NEXT:    s_add_i32 s6, s7, 3
+; SI-NEXT:    s_lshl_b32 s5, s44, 8
+; SI-NEXT:    s_add_i32 s6, s12, 3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_and_b32 s6, s6, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s62, 24
+; SI-NEXT:    s_lshl_b32 s5, s7, 24
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
 ; SI-NEXT:    s_addk_i32 s4, 0x300
 ; SI-NEXT:    s_or_b32 s5, s5, s6
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s5, s8, 3
+; SI-NEXT:    s_add_i32 s5, s56, 3
 ; SI-NEXT:    s_and_b32 s5, s5, 0xff
-; SI-NEXT:    s_lshl_b32 s6, s15, 8
-; SI-NEXT:    s_add_i32 s16, s83, 3
+; SI-NEXT:    s_lshl_b32 s6, s81, 8
+; SI-NEXT:    s_add_i32 s16, s82, 3
 ; SI-NEXT:    s_or_b32 s5, s6, s5
 ; SI-NEXT:    s_and_b32 s16, s16, 0xff
-; SI-NEXT:    s_lshl_b32 s6, s97, 24
+; SI-NEXT:    s_lshl_b32 s6, s96, 24
 ; SI-NEXT:    s_lshl_b32 s16, s16, 16
 ; SI-NEXT:    s_addk_i32 s5, 0x300
 ; SI-NEXT:    s_or_b32 s6, s6, s16
 ; SI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s6, s5
-; SI-NEXT:    s_add_i32 s6, s41, 3
+; SI-NEXT:    s_add_i32 s6, s15, 3
 ; SI-NEXT:    s_and_b32 s6, s6, 0xff
-; SI-NEXT:    s_lshl_b32 s16, s14, 8
-; SI-NEXT:    s_add_i32 s17, s10, 3
+; SI-NEXT:    s_lshl_b32 s16, s41, 8
+; SI-NEXT:    s_add_i32 s17, s14, 3
 ; SI-NEXT:    s_or_b32 s6, s16, s6
 ; SI-NEXT:    s_and_b32 s17, s17, 0xff
 ; SI-NEXT:    s_lshl_b32 s16, s9, 24
@@ -157890,156 +157847,162 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    s_or_b32 s16, s16, s17
 ; SI-NEXT:    s_and_b32 s6, s6, 0xffff
 ; SI-NEXT:    s_or_b32 s6, s16, s6
-; SI-NEXT:    s_add_i32 s16, s74, 3
+; SI-NEXT:    s_add_i32 s16, s93, 3
 ; SI-NEXT:    s_and_b32 s16, s16, 0xff
-; SI-NEXT:    s_lshl_b32 s17, s63, 8
-; SI-NEXT:    s_add_i32 s18, s71, 3
+; SI-NEXT:    s_lshl_b32 s17, s91, 8
+; SI-NEXT:    s_add_i32 s18, s10, 3
 ; SI-NEXT:    s_or_b32 s16, s17, s16
 ; SI-NEXT:    s_and_b32 s18, s18, 0xff
-; SI-NEXT:    s_lshl_b32 s17, s81, 24
+; SI-NEXT:    s_lshl_b32 s17, s71, 24
 ; SI-NEXT:    s_lshl_b32 s18, s18, 16
 ; SI-NEXT:    s_addk_i32 s16, 0x300
 ; SI-NEXT:    s_or_b32 s17, s17, s18
 ; SI-NEXT:    s_and_b32 s16, s16, 0xffff
 ; SI-NEXT:    s_or_b32 s16, s17, s16
-; SI-NEXT:    s_add_i32 s17, s46, 3
+; SI-NEXT:    s_add_i32 s17, s36, 3
 ; SI-NEXT:    s_and_b32 s17, s17, 0xff
-; SI-NEXT:    s_lshl_b32 s18, s69, 8
-; SI-NEXT:    s_add_i32 s19, s40, 3
+; SI-NEXT:    s_lshl_b32 s18, s30, 8
+; SI-NEXT:    s_add_i32 s19, s78, 3
 ; SI-NEXT:    s_or_b32 s17, s18, s17
 ; SI-NEXT:    s_and_b32 s19, s19, 0xff
-; SI-NEXT:    s_lshl_b32 s18, s58, 24
+; SI-NEXT:    s_lshl_b32 s18, s31, 24
 ; SI-NEXT:    s_lshl_b32 s19, s19, 16
 ; SI-NEXT:    s_addk_i32 s17, 0x300
 ; SI-NEXT:    s_or_b32 s18, s18, s19
 ; SI-NEXT:    s_and_b32 s17, s17, 0xffff
 ; SI-NEXT:    s_or_b32 s17, s18, s17
-; SI-NEXT:    s_add_i32 s18, s61, 3
+; SI-NEXT:    s_add_i32 s18, s94, 3
 ; SI-NEXT:    s_and_b32 s18, s18, 0xff
-; SI-NEXT:    s_lshl_b32 s19, s60, 8
-; SI-NEXT:    s_add_i32 s20, s68, 3
+; SI-NEXT:    s_lshl_b32 s19, s34, 8
+; SI-NEXT:    s_add_i32 s20, s37, 3
 ; SI-NEXT:    s_or_b32 s18, s19, s18
 ; SI-NEXT:    s_and_b32 s20, s20, 0xff
-; SI-NEXT:    s_lshl_b32 s19, s59, 24
+; SI-NEXT:    s_lshl_b32 s19, s38, 24
 ; SI-NEXT:    s_lshl_b32 s20, s20, 16
 ; SI-NEXT:    s_addk_i32 s18, 0x300
 ; SI-NEXT:    s_or_b32 s19, s19, s20
 ; SI-NEXT:    s_and_b32 s18, s18, 0xffff
 ; SI-NEXT:    s_or_b32 s18, s19, s18
-; SI-NEXT:    s_add_i32 s19, s94, 3
+; SI-NEXT:    s_add_i32 s19, s95, 3
 ; SI-NEXT:    s_and_b32 s19, s19, 0xff
-; SI-NEXT:    s_lshl_b32 s20, s42, 8
-; SI-NEXT:    s_add_i32 s22, s47, 3
+; SI-NEXT:    s_lshl_b32 s20, s46, 8
+; SI-NEXT:    s_add_i32 s22, s48, 3
 ; SI-NEXT:    s_or_b32 s19, s20, s19
 ; SI-NEXT:    s_and_b32 s22, s22, 0xff
-; SI-NEXT:    s_lshl_b32 s20, s56, 24
+; SI-NEXT:    s_lshl_b32 s20, s39, 24
 ; SI-NEXT:    s_lshl_b32 s22, s22, 16
 ; SI-NEXT:    s_addk_i32 s19, 0x300
 ; SI-NEXT:    s_or_b32 s20, s20, s22
 ; SI-NEXT:    s_and_b32 s19, s19, 0xffff
 ; SI-NEXT:    s_or_b32 s19, s20, s19
-; SI-NEXT:    s_add_i32 s20, s88, 3
+; SI-NEXT:    s_add_i32 s20, s58, 3
+; SI-NEXT:    v_readlane_b32 s7, v43, 18
 ; SI-NEXT:    s_and_b32 s20, s20, 0xff
-; SI-NEXT:    s_lshl_b32 s22, s90, 8
-; SI-NEXT:    s_add_i32 s23, s91, 3
+; SI-NEXT:    s_lshl_b32 s22, s7, 8
+; SI-NEXT:    v_readlane_b32 s7, v42, 49
 ; SI-NEXT:    s_or_b32 s20, s22, s20
+; SI-NEXT:    s_lshl_b32 s22, s7, 24
+; SI-NEXT:    v_readlane_b32 s7, v42, 48
+; SI-NEXT:    s_add_i32 s23, s7, 3
 ; SI-NEXT:    s_and_b32 s23, s23, 0xff
-; SI-NEXT:    s_lshl_b32 s22, s76, 24
 ; SI-NEXT:    s_lshl_b32 s23, s23, 16
 ; SI-NEXT:    s_addk_i32 s20, 0x300
 ; SI-NEXT:    s_or_b32 s22, s22, s23
 ; SI-NEXT:    s_and_b32 s20, s20, 0xffff
+; SI-NEXT:    v_readlane_b32 s7, v43, 23
 ; SI-NEXT:    s_or_b32 s20, s22, s20
-; SI-NEXT:    s_add_i32 s22, s89, 3
-; SI-NEXT:    v_readlane_b32 s7, v43, 55
+; SI-NEXT:    s_add_i32 s22, s7, 3
+; SI-NEXT:    v_readlane_b32 s7, v43, 22
 ; SI-NEXT:    s_and_b32 s22, s22, 0xff
-; SI-NEXT:    s_lshl_b32 s23, s93, 8
-; SI-NEXT:    s_add_i32 s60, s7, 3
+; SI-NEXT:    s_lshl_b32 s23, s7, 8
+; SI-NEXT:    v_readlane_b32 s7, v43, 20
 ; SI-NEXT:    s_or_b32 s22, s23, s22
+; SI-NEXT:    s_lshl_b32 s23, s7, 24
+; SI-NEXT:    v_readlane_b32 s7, v43, 21
+; SI-NEXT:    s_add_i32 s60, s7, 3
 ; SI-NEXT:    s_and_b32 s60, s60, 0xff
-; SI-NEXT:    s_lshl_b32 s23, s92, 24
 ; SI-NEXT:    s_lshl_b32 s60, s60, 16
 ; SI-NEXT:    s_addk_i32 s22, 0x300
 ; SI-NEXT:    s_or_b32 s23, s23, s60
 ; SI-NEXT:    s_and_b32 s22, s22, 0xffff
-; SI-NEXT:    v_readlane_b32 s7, v44, 47
+; SI-NEXT:    v_readlane_b32 s7, v43, 27
 ; SI-NEXT:    s_or_b32 s22, s23, s22
 ; SI-NEXT:    s_add_i32 s23, s7, 3
-; SI-NEXT:    v_readlane_b32 s7, v44, 45
+; SI-NEXT:    v_readlane_b32 s7, v43, 26
 ; SI-NEXT:    s_and_b32 s23, s23, 0xff
 ; SI-NEXT:    s_lshl_b32 s60, s7, 8
-; SI-NEXT:    v_readlane_b32 s7, v43, 54
+; SI-NEXT:    v_readlane_b32 s7, v43, 24
 ; SI-NEXT:    s_or_b32 s23, s60, s23
 ; SI-NEXT:    s_lshl_b32 s60, s7, 24
-; SI-NEXT:    v_readlane_b32 s7, v43, 53
+; SI-NEXT:    v_readlane_b32 s7, v43, 25
 ; SI-NEXT:    s_add_i32 s61, s7, 3
 ; SI-NEXT:    s_and_b32 s61, s61, 0xff
 ; SI-NEXT:    s_lshl_b32 s61, s61, 16
 ; SI-NEXT:    s_addk_i32 s23, 0x300
 ; SI-NEXT:    s_or_b32 s60, s60, s61
 ; SI-NEXT:    s_and_b32 s23, s23, 0xffff
-; SI-NEXT:    v_readlane_b32 s7, v44, 48
+; SI-NEXT:    v_readlane_b32 s7, v43, 31
 ; SI-NEXT:    s_or_b32 s23, s60, s23
 ; SI-NEXT:    s_add_i32 s60, s7, 3
-; SI-NEXT:    v_readlane_b32 s7, v44, 46
+; SI-NEXT:    v_readlane_b32 s7, v43, 30
 ; SI-NEXT:    s_and_b32 s60, s60, 0xff
 ; SI-NEXT:    s_lshl_b32 s61, s7, 8
-; SI-NEXT:    v_readlane_b32 s7, v44, 44
+; SI-NEXT:    v_readlane_b32 s7, v43, 28
 ; SI-NEXT:    s_or_b32 s60, s61, s60
 ; SI-NEXT:    s_lshl_b32 s61, s7, 24
-; SI-NEXT:    v_readlane_b32 s7, v44, 43
+; SI-NEXT:    v_readlane_b32 s7, v43, 29
 ; SI-NEXT:    s_add_i32 s62, s7, 3
-; SI-NEXT:    v_readlane_b32 s7, v43, 52
+; SI-NEXT:    v_readlane_b32 s7, v42, 47
 ; SI-NEXT:    s_and_b32 s62, s62, 0xff
 ; SI-NEXT:    s_add_i32 s59, s7, 3
-; SI-NEXT:    v_readlane_b32 s7, v43, 51
+; SI-NEXT:    v_readlane_b32 s7, v42, 46
 ; SI-NEXT:    s_lshl_b32 s62, s62, 16
 ; SI-NEXT:    s_addk_i32 s60, 0x300
 ; SI-NEXT:    s_and_b32 s59, s59, 0xff
 ; SI-NEXT:    s_lshl_b32 s58, s7, 8
-; SI-NEXT:    v_readlane_b32 s7, v44, 42
+; SI-NEXT:    v_readlane_b32 s7, v43, 32
 ; SI-NEXT:    s_or_b32 s61, s61, s62
 ; SI-NEXT:    s_and_b32 s60, s60, 0xffff
 ; SI-NEXT:    s_or_b32 s58, s58, s59
 ; SI-NEXT:    s_lshl_b32 s59, s7, 24
-; SI-NEXT:    v_readlane_b32 s7, v44, 35
+; SI-NEXT:    v_readlane_b32 s7, v43, 33
 ; SI-NEXT:    s_or_b32 s60, s61, s60
 ; SI-NEXT:    s_add_i32 s61, s7, 3
-; SI-NEXT:    v_readlane_b32 s7, v43, 50
+; SI-NEXT:    v_readlane_b32 s7, v42, 45
 ; SI-NEXT:    s_add_i32 s57, s7, 3
-; SI-NEXT:    v_readlane_b32 s7, v43, 49
+; SI-NEXT:    v_readlane_b32 s7, v42, 44
 ; SI-NEXT:    s_lshl_b32 s56, s7, 8
-; SI-NEXT:    v_readlane_b32 s7, v43, 48
+; SI-NEXT:    v_readlane_b32 s7, v42, 43
 ; SI-NEXT:    s_lshl_b32 s47, s7, 24
-; SI-NEXT:    v_readlane_b32 s7, v43, 47
+; SI-NEXT:    v_readlane_b32 s7, v42, 42
 ; SI-NEXT:    s_add_i32 s46, s7, 3
-; SI-NEXT:    v_readlane_b32 s7, v43, 46
+; SI-NEXT:    v_readlane_b32 s7, v42, 41
 ; SI-NEXT:    s_add_i32 s45, s7, 3
-; SI-NEXT:    v_readlane_b32 s7, v43, 43
+; SI-NEXT:    v_readlane_b32 s7, v42, 38
 ; SI-NEXT:    s_lshl_b32 s42, s7, 8
-; SI-NEXT:    v_readlane_b32 s7, v43, 40
+; SI-NEXT:    v_readlane_b32 s7, v42, 35
 ; SI-NEXT:    s_lshl_b32 s15, s7, 24
-; SI-NEXT:    v_readlane_b32 s7, v43, 39
+; SI-NEXT:    v_readlane_b32 s7, v42, 34
 ; SI-NEXT:    s_and_b32 s45, s45, 0xff
 ; SI-NEXT:    s_add_i32 s14, s7, 3
 ; SI-NEXT:    s_or_b32 s42, s42, s45
 ; SI-NEXT:    s_and_b32 s14, s14, 0xff
 ; SI-NEXT:    s_lshl_b32 s14, s14, 16
 ; SI-NEXT:    s_addk_i32 s42, 0x300
-; SI-NEXT:    v_readlane_b32 s7, v43, 45
+; SI-NEXT:    v_readlane_b32 s7, v42, 40
 ; SI-NEXT:    s_and_b32 s57, s57, 0xff
 ; SI-NEXT:    s_or_b32 s14, s15, s14
 ; SI-NEXT:    s_and_b32 s15, s42, 0xffff
 ; SI-NEXT:    s_add_i32 s44, s7, 3
-; SI-NEXT:    v_readlane_b32 s7, v43, 44
+; SI-NEXT:    v_readlane_b32 s7, v42, 39
 ; SI-NEXT:    s_or_b32 s56, s56, s57
 ; SI-NEXT:    s_or_b32 s57, s14, s15
 ; SI-NEXT:    s_and_b32 s14, s44, 0xff
 ; SI-NEXT:    s_lshl_b32 s15, s7, 8
-; SI-NEXT:    v_readlane_b32 s7, v43, 42
+; SI-NEXT:    v_readlane_b32 s7, v42, 37
 ; SI-NEXT:    s_or_b32 s14, s15, s14
 ; SI-NEXT:    s_lshl_b32 s15, s7, 24
-; SI-NEXT:    v_readlane_b32 s7, v43, 41
+; SI-NEXT:    v_readlane_b32 s7, v42, 36
 ; SI-NEXT:    s_add_i32 s40, s7, 3
 ; SI-NEXT:    s_and_b32 s61, s61, 0xff
 ; SI-NEXT:    s_and_b32 s40, s40, 0xff
@@ -158054,15 +158017,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    s_or_b32 s58, s59, s58
 ; SI-NEXT:    s_or_b32 s59, s15, s14
 ; SI-NEXT:    s_add_i32 s14, s6, 0x3000000
-; SI-NEXT:    v_readlane_b32 s6, v43, 36
+; SI-NEXT:    v_readlane_b32 s6, v42, 31
 ; SI-NEXT:    s_add_i32 s11, s6, 3
-; SI-NEXT:    v_readlane_b32 s7, v43, 33
+; SI-NEXT:    v_readlane_b32 s7, v42, 28
 ; SI-NEXT:    s_and_b32 s6, s11, 0xff
 ; SI-NEXT:    s_lshl_b32 s8, s7, 8
-; SI-NEXT:    v_readlane_b32 s7, v43, 30
+; SI-NEXT:    v_readlane_b32 s7, v42, 25
 ; SI-NEXT:    s_or_b32 s6, s8, s6
 ; SI-NEXT:    s_lshl_b32 s8, s7, 24
-; SI-NEXT:    v_readlane_b32 s7, v43, 29
+; SI-NEXT:    v_readlane_b32 s7, v42, 24
 ; SI-NEXT:    s_add_i32 s24, s7, 3
 ; SI-NEXT:    s_and_b32 s11, s24, 0xff
 ; SI-NEXT:    s_addk_i32 s6, 0x300
@@ -158070,47 +158033,47 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    s_and_b32 s6, s6, 0xffff
 ; SI-NEXT:    s_or_b32 s8, s8, s11
 ; SI-NEXT:    s_or_b32 s8, s8, s6
-; SI-NEXT:    v_readlane_b32 s6, v43, 37
+; SI-NEXT:    v_readlane_b32 s6, v42, 32
 ; SI-NEXT:    s_add_i32 s12, s6, 3
-; SI-NEXT:    v_readlane_b32 s7, v43, 31
+; SI-NEXT:    v_readlane_b32 s7, v42, 26
 ; SI-NEXT:    s_and_b32 s6, s12, 0xff
 ; SI-NEXT:    s_lshl_b32 s11, s7, 8
-; SI-NEXT:    v_readlane_b32 s7, v43, 21
+; SI-NEXT:    v_readlane_b32 s7, v42, 16
 ; SI-NEXT:    s_or_b32 s6, s11, s6
 ; SI-NEXT:    s_lshl_b32 s11, s7, 24
-; SI-NEXT:    v_readlane_b32 s7, v43, 23
+; SI-NEXT:    v_readlane_b32 s7, v42, 18
 ; SI-NEXT:    s_add_i32 s12, s7, 3
 ; SI-NEXT:    s_and_b32 s12, s12, 0xff
 ; SI-NEXT:    s_addk_i32 s6, 0x300
 ; SI-NEXT:    s_lshl_b32 s12, s12, 16
-; SI-NEXT:    v_readlane_b32 s7, v43, 38
+; SI-NEXT:    v_readlane_b32 s7, v42, 33
 ; SI-NEXT:    s_and_b32 s6, s6, 0xffff
 ; SI-NEXT:    s_or_b32 s11, s11, s12
 ; SI-NEXT:    s_add_i32 s13, s7, 3
-; SI-NEXT:    v_readlane_b32 s7, v43, 35
+; SI-NEXT:    v_readlane_b32 s7, v42, 30
 ; SI-NEXT:    s_or_b32 s6, s11, s6
 ; SI-NEXT:    s_and_b32 s11, s13, 0xff
 ; SI-NEXT:    s_lshl_b32 s10, s7, 8
-; SI-NEXT:    v_readlane_b32 s7, v43, 27
+; SI-NEXT:    v_readlane_b32 s7, v42, 22
 ; SI-NEXT:    s_or_b32 s10, s10, s11
 ; SI-NEXT:    s_lshl_b32 s11, s7, 24
-; SI-NEXT:    v_readlane_b32 s7, v43, 28
+; SI-NEXT:    v_readlane_b32 s7, v42, 23
 ; SI-NEXT:    s_add_i32 s25, s7, 3
 ; SI-NEXT:    s_and_b32 s12, s25, 0xff
 ; SI-NEXT:    s_addk_i32 s10, 0x300
 ; SI-NEXT:    s_lshl_b32 s12, s12, 16
 ; SI-NEXT:    s_and_b32 s10, s10, 0xffff
 ; SI-NEXT:    s_or_b32 s11, s11, s12
-; SI-NEXT:    v_readlane_b32 s7, v43, 34
+; SI-NEXT:    v_readlane_b32 s7, v42, 29
 ; SI-NEXT:    s_or_b32 s10, s11, s10
 ; SI-NEXT:    s_add_i32 s9, s7, 3
-; SI-NEXT:    v_readlane_b32 s7, v43, 32
-; SI-NEXT:    v_readlane_b32 s11, v43, 25
+; SI-NEXT:    v_readlane_b32 s7, v42, 27
+; SI-NEXT:    v_readlane_b32 s11, v42, 20
 ; SI-NEXT:    s_and_b32 s9, s9, 0xff
 ; SI-NEXT:    s_lshl_b32 s7, s7, 8
 ; SI-NEXT:    s_add_i32 s11, s11, 3
 ; SI-NEXT:    s_or_b32 s7, s7, s9
-; SI-NEXT:    v_readlane_b32 s9, v43, 26
+; SI-NEXT:    v_readlane_b32 s9, v42, 21
 ; SI-NEXT:    s_and_b32 s11, s11, 0xff
 ; SI-NEXT:    s_addk_i32 s7, 0x300
 ; SI-NEXT:    s_lshl_b32 s9, s9, 24
@@ -158118,15 +158081,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    s_and_b32 s7, s7, 0xffff
 ; SI-NEXT:    s_or_b32 s9, s9, s11
 ; SI-NEXT:    s_or_b32 s7, s9, s7
-; SI-NEXT:    v_readlane_b32 s9, v43, 24
+; SI-NEXT:    v_readlane_b32 s9, v42, 19
 ; SI-NEXT:    s_add_i32 s21, s9, 3
-; SI-NEXT:    v_readlane_b32 s11, v43, 22
-; SI-NEXT:    v_readlane_b32 s12, v43, 19
+; SI-NEXT:    v_readlane_b32 s11, v42, 17
+; SI-NEXT:    v_readlane_b32 s12, v42, 14
 ; SI-NEXT:    s_and_b32 s9, s21, 0xff
 ; SI-NEXT:    s_lshl_b32 s11, s11, 8
 ; SI-NEXT:    s_add_i32 s12, s12, 3
 ; SI-NEXT:    s_or_b32 s9, s11, s9
-; SI-NEXT:    v_readlane_b32 s11, v43, 20
+; SI-NEXT:    v_readlane_b32 s11, v42, 15
 ; SI-NEXT:    s_and_b32 s12, s12, 0xff
 ; SI-NEXT:    s_addk_i32 s9, 0x300
 ; SI-NEXT:    s_lshl_b32 s11, s11, 24
@@ -158134,15 +158097,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    s_and_b32 s9, s9, 0xffff
 ; SI-NEXT:    s_or_b32 s11, s11, s12
 ; SI-NEXT:    s_or_b32 s9, s11, s9
-; SI-NEXT:    v_readlane_b32 s11, v43, 18
+; SI-NEXT:    v_readlane_b32 s11, v42, 13
 ; SI-NEXT:    s_add_i32 s11, s11, 3
-; SI-NEXT:    v_readlane_b32 s12, v43, 17
-; SI-NEXT:    v_readlane_b32 s13, v43, 15
+; SI-NEXT:    v_readlane_b32 s12, v42, 12
+; SI-NEXT:    v_readlane_b32 s13, v42, 10
 ; SI-NEXT:    s_and_b32 s11, s11, 0xff
 ; SI-NEXT:    s_lshl_b32 s12, s12, 8
 ; SI-NEXT:    s_add_i32 s13, s13, 3
 ; SI-NEXT:    s_or_b32 s11, s12, s11
-; SI-NEXT:    v_readlane_b32 s12, v43, 16
+; SI-NEXT:    v_readlane_b32 s12, v42, 11
 ; SI-NEXT:    s_and_b32 s13, s13, 0xff
 ; SI-NEXT:    s_addk_i32 s11, 0x300
 ; SI-NEXT:    s_lshl_b32 s12, s12, 24
@@ -158150,16 +158113,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    s_and_b32 s11, s11, 0xffff
 ; SI-NEXT:    s_or_b32 s12, s12, s13
 ; SI-NEXT:    s_or_b32 s11, s12, s11
-; SI-NEXT:    v_readlane_b32 s12, v43, 14
+; SI-NEXT:    v_readlane_b32 s12, v42, 9
 ; SI-NEXT:    s_add_i32 s15, s16, 0x3000000
 ; SI-NEXT:    s_add_i32 s12, s12, 3
-; SI-NEXT:    v_readlane_b32 s13, v43, 13
-; SI-NEXT:    v_readlane_b32 s16, v43, 11
+; SI-NEXT:    v_readlane_b32 s13, v42, 8
+; SI-NEXT:    v_readlane_b32 s16, v42, 6
 ; SI-NEXT:    s_and_b32 s12, s12, 0xff
 ; SI-NEXT:    s_lshl_b32 s13, s13, 8
 ; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    s_or_b32 s12, s13, s12
-; SI-NEXT:    v_readlane_b32 s13, v43, 12
+; SI-NEXT:    v_readlane_b32 s13, v42, 7
 ; SI-NEXT:    s_and_b32 s16, s16, 0xff
 ; SI-NEXT:    s_addk_i32 s12, 0x300
 ; SI-NEXT:    s_lshl_b32 s13, s13, 24
@@ -158167,16 +158130,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    s_and_b32 s12, s12, 0xffff
 ; SI-NEXT:    s_or_b32 s13, s13, s16
 ; SI-NEXT:    s_or_b32 s12, s13, s12
-; SI-NEXT:    v_readlane_b32 s13, v43, 10
+; SI-NEXT:    v_readlane_b32 s13, v42, 5
 ; SI-NEXT:    s_add_i32 s40, s17, 0x3000000
 ; SI-NEXT:    s_add_i32 s13, s13, 3
-; SI-NEXT:    v_readlane_b32 s16, v43, 9
-; SI-NEXT:    v_readlane_b32 s17, v43, 7
+; SI-NEXT:    v_readlane_b32 s16, v42, 4
+; SI-NEXT:    v_readlane_b32 s17, v42, 2
 ; SI-NEXT:    s_and_b32 s13, s13, 0xff
 ; SI-NEXT:    s_lshl_b32 s16, s16, 8
 ; SI-NEXT:    s_add_i32 s17, s17, 3
 ; SI-NEXT:    s_or_b32 s13, s16, s13
-; SI-NEXT:    v_readlane_b32 s16, v43, 8
+; SI-NEXT:    v_readlane_b32 s16, v42, 3
 ; SI-NEXT:    s_and_b32 s17, s17, 0xff
 ; SI-NEXT:    s_addk_i32 s13, 0x300
 ; SI-NEXT:    s_lshl_b32 s16, s16, 24
@@ -158184,16 +158147,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    s_and_b32 s13, s13, 0xffff
 ; SI-NEXT:    s_or_b32 s16, s16, s17
 ; SI-NEXT:    s_or_b32 s13, s16, s13
-; SI-NEXT:    v_readlane_b32 s16, v43, 6
+; SI-NEXT:    v_readlane_b32 s16, v42, 1
 ; SI-NEXT:    s_add_i32 s41, s18, 0x3000000
 ; SI-NEXT:    s_add_i32 s16, s16, 3
-; SI-NEXT:    v_readlane_b32 s17, v43, 5
-; SI-NEXT:    v_readlane_b32 s18, v43, 3
+; SI-NEXT:    v_readlane_b32 s17, v42, 0
+; SI-NEXT:    v_readlane_b32 s18, v43, 62
 ; SI-NEXT:    s_and_b32 s16, s16, 0xff
 ; SI-NEXT:    s_lshl_b32 s17, s17, 8
 ; SI-NEXT:    s_add_i32 s18, s18, 3
 ; SI-NEXT:    s_or_b32 s16, s17, s16
-; SI-NEXT:    v_readlane_b32 s17, v43, 4
+; SI-NEXT:    v_readlane_b32 s17, v43, 63
 ; SI-NEXT:    s_and_b32 s18, s18, 0xff
 ; SI-NEXT:    s_addk_i32 s16, 0x300
 ; SI-NEXT:    s_lshl_b32 s17, s17, 24
@@ -158202,16 +158165,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    s_or_b32 s17, s17, s18
 ; SI-NEXT:    s_or_b32 s16, s17, s16
 ; SI-NEXT:    s_add_i32 s17, s16, 0x3000000
-; SI-NEXT:    v_readlane_b32 s16, v43, 2
+; SI-NEXT:    v_readlane_b32 s16, v43, 61
 ; SI-NEXT:    s_add_i32 s42, s19, 0x3000000
 ; SI-NEXT:    s_add_i32 s16, s16, 3
-; SI-NEXT:    v_readlane_b32 s18, v43, 1
-; SI-NEXT:    v_readlane_b32 s19, v44, 63
+; SI-NEXT:    v_readlane_b32 s18, v43, 60
+; SI-NEXT:    v_readlane_b32 s19, v43, 58
 ; SI-NEXT:    s_and_b32 s16, s16, 0xff
 ; SI-NEXT:    s_lshl_b32 s18, s18, 8
 ; SI-NEXT:    s_add_i32 s19, s19, 3
 ; SI-NEXT:    s_or_b32 s16, s18, s16
-; SI-NEXT:    v_readlane_b32 s18, v43, 0
+; SI-NEXT:    v_readlane_b32 s18, v43, 59
 ; SI-NEXT:    s_and_b32 s19, s19, 0xff
 ; SI-NEXT:    s_addk_i32 s16, 0x300
 ; SI-NEXT:    s_lshl_b32 s18, s18, 24
@@ -158219,16 +158182,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    s_and_b32 s16, s16, 0xffff
 ; SI-NEXT:    s_or_b32 s18, s18, s19
 ; SI-NEXT:    s_or_b32 s16, s18, s16
-; SI-NEXT:    v_readlane_b32 s18, v44, 62
+; SI-NEXT:    v_readlane_b32 s18, v43, 57
 ; SI-NEXT:    s_add_i32 s43, s20, 0x3000000
 ; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    v_readlane_b32 s19, v44, 61
-; SI-NEXT:    v_readlane_b32 s20, v44, 59
+; SI-NEXT:    v_readlane_b32 s19, v43, 56
+; SI-NEXT:    v_readlane_b32 s20, v43, 54
 ; SI-NEXT:    s_and_b32 s18, s18, 0xff
 ; SI-NEXT:    s_lshl_b32 s19, s19, 8
 ; SI-NEXT:    s_add_i32 s20, s20, 3
 ; SI-NEXT:    s_or_b32 s18, s19, s18
-; SI-NEXT:    v_readlane_b32 s19, v44, 60
+; SI-NEXT:    v_readlane_b32 s19, v43, 55
 ; SI-NEXT:    s_and_b32 s20, s20, 0xff
 ; SI-NEXT:    s_addk_i32 s18, 0x300
 ; SI-NEXT:    s_lshl_b32 s19, s19, 24
@@ -158236,15 +158199,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    s_and_b32 s18, s18, 0xffff
 ; SI-NEXT:    s_or_b32 s19, s19, s20
 ; SI-NEXT:    s_or_b32 s18, s19, s18
-; SI-NEXT:    v_readlane_b32 s19, v44, 58
+; SI-NEXT:    v_readlane_b32 s19, v43, 53
 ; SI-NEXT:    s_add_i32 s19, s19, 3
-; SI-NEXT:    v_readlane_b32 s20, v44, 57
-; SI-NEXT:    v_readlane_b32 s21, v44, 55
+; SI-NEXT:    v_readlane_b32 s20, v43, 52
+; SI-NEXT:    v_readlane_b32 s21, v43, 50
 ; SI-NEXT:    s_and_b32 s19, s19, 0xff
 ; SI-NEXT:    s_lshl_b32 s20, s20, 8
 ; SI-NEXT:    s_add_i32 s21, s21, 3
 ; SI-NEXT:    s_or_b32 s19, s20, s19
-; SI-NEXT:    v_readlane_b32 s20, v44, 56
+; SI-NEXT:    v_readlane_b32 s20, v43, 51
 ; SI-NEXT:    s_and_b32 s21, s21, 0xff
 ; SI-NEXT:    s_addk_i32 s19, 0x300
 ; SI-NEXT:    s_lshl_b32 s20, s20, 24
@@ -158252,16 +158215,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    s_and_b32 s19, s19, 0xffff
 ; SI-NEXT:    s_or_b32 s20, s20, s21
 ; SI-NEXT:    s_or_b32 s19, s20, s19
-; SI-NEXT:    v_readlane_b32 s20, v44, 54
+; SI-NEXT:    v_readlane_b32 s20, v43, 49
 ; SI-NEXT:    s_add_i32 s44, s22, 0x3000000
 ; SI-NEXT:    s_add_i32 s20, s20, 3
-; SI-NEXT:    v_readlane_b32 s21, v44, 53
-; SI-NEXT:    v_readlane_b32 s22, v44, 51
+; SI-NEXT:    v_readlane_b32 s21, v43, 48
+; SI-NEXT:    v_readlane_b32 s22, v43, 46
 ; SI-NEXT:    s_and_b32 s20, s20, 0xff
 ; SI-NEXT:    s_lshl_b32 s21, s21, 8
 ; SI-NEXT:    s_add_i32 s22, s22, 3
 ; SI-NEXT:    s_or_b32 s20, s21, s20
-; SI-NEXT:    v_readlane_b32 s21, v44, 52
+; SI-NEXT:    v_readlane_b32 s21, v43, 47
 ; SI-NEXT:    s_and_b32 s22, s22, 0xff
 ; SI-NEXT:    s_addk_i32 s20, 0x300
 ; SI-NEXT:    s_lshl_b32 s21, s21, 24
@@ -158270,16 +158233,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    s_or_b32 s21, s21, s22
 ; SI-NEXT:    s_or_b32 s20, s21, s20
 ; SI-NEXT:    s_add_i32 s21, s20, 0x3000000
-; SI-NEXT:    v_readlane_b32 s20, v44, 1
+; SI-NEXT:    v_readlane_b32 s20, v43, 43
 ; SI-NEXT:    s_add_i32 s45, s23, 0x3000000
 ; SI-NEXT:    s_add_i32 s20, s20, 3
-; SI-NEXT:    v_readlane_b32 s22, v44, 0
-; SI-NEXT:    v_readlane_b32 s23, v44, 49
+; SI-NEXT:    v_readlane_b32 s22, v43, 42
+; SI-NEXT:    v_readlane_b32 s23, v43, 44
 ; SI-NEXT:    s_and_b32 s20, s20, 0xff
 ; SI-NEXT:    s_lshl_b32 s22, s22, 8
 ; SI-NEXT:    s_add_i32 s23, s23, 3
 ; SI-NEXT:    s_or_b32 s20, s22, s20
-; SI-NEXT:    v_readlane_b32 s22, v44, 50
+; SI-NEXT:    v_readlane_b32 s22, v43, 45
 ; SI-NEXT:    s_and_b32 s23, s23, 0xff
 ; SI-NEXT:    s_addk_i32 s20, 0x300
 ; SI-NEXT:    s_lshl_b32 s22, s22, 24
@@ -158288,15 +158251,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    s_or_b32 s22, s22, s23
 ; SI-NEXT:    s_or_b32 s20, s22, s20
 ; SI-NEXT:    s_add_i32 s22, s20, 0x3000000
-; SI-NEXT:    v_readlane_b32 s20, v44, 41
+; SI-NEXT:    v_readlane_b32 s20, v43, 41
 ; SI-NEXT:    s_add_i32 s20, s20, 3
-; SI-NEXT:    v_readlane_b32 s23, v44, 40
-; SI-NEXT:    v_readlane_b32 s24, v44, 3
+; SI-NEXT:    v_readlane_b32 s23, v43, 40
+; SI-NEXT:    v_readlane_b32 s24, v43, 38
 ; SI-NEXT:    s_and_b32 s20, s20, 0xff
 ; SI-NEXT:    s_lshl_b32 s23, s23, 8
 ; SI-NEXT:    s_add_i32 s24, s24, 3
 ; SI-NEXT:    s_or_b32 s20, s23, s20
-; SI-NEXT:    v_readlane_b32 s23, v44, 2
+; SI-NEXT:    v_readlane_b32 s23, v43, 39
 ; SI-NEXT:    s_and_b32 s24, s24, 0xff
 ; SI-NEXT:    s_addk_i32 s20, 0x300
 ; SI-NEXT:    s_lshl_b32 s23, s23, 24
@@ -158305,136 +158268,134 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    s_or_b32 s23, s23, s24
 ; SI-NEXT:    s_or_b32 s20, s23, s20
 ; SI-NEXT:    s_add_i32 s23, s20, 0x3000000
-; SI-NEXT:    v_readlane_b32 s20, v44, 39
+; SI-NEXT:    v_readlane_b32 s20, v43, 37
 ; SI-NEXT:    s_add_i32 s20, s20, 3
-; SI-NEXT:    v_readlane_b32 s24, v44, 38
-; SI-NEXT:    v_readlane_b32 s25, v44, 36
+; SI-NEXT:    v_readlane_b32 s24, v43, 36
+; SI-NEXT:    v_readlane_b32 s25, v43, 34
 ; SI-NEXT:    s_and_b32 s20, s20, 0xff
 ; SI-NEXT:    s_lshl_b32 s24, s24, 8
 ; SI-NEXT:    s_add_i32 s25, s25, 3
 ; SI-NEXT:    s_or_b32 s20, s24, s20
-; SI-NEXT:    v_readlane_b32 s24, v44, 37
+; SI-NEXT:    v_readlane_b32 s24, v43, 35
 ; SI-NEXT:    s_and_b32 s25, s25, 0xff
 ; SI-NEXT:    s_addk_i32 s20, 0x300
 ; SI-NEXT:    s_lshl_b32 s24, s24, 24
 ; SI-NEXT:    s_lshl_b32 s25, s25, 16
 ; SI-NEXT:    s_and_b32 s20, s20, 0xffff
 ; SI-NEXT:    s_or_b32 s24, s24, s25
-; SI-NEXT:    s_add_i32 s16, s16, 0x3000000
-; SI-NEXT:    s_add_i32 s18, s18, 0x3000000
+; SI-NEXT:    s_and_b32 s46, s46, 0xff
 ; SI-NEXT:    s_or_b32 s20, s24, s20
-; SI-NEXT:    v_readlane_b32 s24, v44, 7
+; SI-NEXT:    v_readlane_b32 s24, v43, 3
+; SI-NEXT:    s_lshl_b32 s46, s46, 16
+; SI-NEXT:    s_addk_i32 s56, 0x300
 ; SI-NEXT:    s_add_i32 s24, s24, 3
-; SI-NEXT:    v_readlane_b32 s25, v44, 6
-; SI-NEXT:    v_readlane_b32 s26, v44, 5
-; SI-NEXT:    s_and_b32 s79, s18, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s77, s18, 16
-; SI-NEXT:    s_and_b32 s18, s16, 0xffff0000
+; SI-NEXT:    v_readlane_b32 s25, v43, 2
+; SI-NEXT:    v_readlane_b32 s26, v43, 1
+; SI-NEXT:    s_or_b32 s46, s47, s46
+; SI-NEXT:    s_and_b32 s47, s56, 0xffff
+; SI-NEXT:    s_add_i32 s7, s7, 0x3000000
+; SI-NEXT:    s_add_i32 s9, s9, 0x3000000
 ; SI-NEXT:    s_and_b32 s24, s24, 0xff
 ; SI-NEXT:    s_lshl_b32 s25, s25, 8
 ; SI-NEXT:    s_add_i32 s26, s26, 3
-; SI-NEXT:    v_writelane_b32 v43, s18, 56
-; SI-NEXT:    s_and_b32 s18, s17, 0xffff0000
-; SI-NEXT:    s_and_b32 s46, s46, 0xff
-; SI-NEXT:    s_add_i32 s13, s13, 0x3000000
+; SI-NEXT:    s_or_b32 s56, s46, s47
+; SI-NEXT:    s_add_i32 s47, s58, 0x3000000
+; SI-NEXT:    s_add_i32 s58, s59, 0x3000000
+; SI-NEXT:    s_add_i32 s10, s10, 0x3000000
 ; SI-NEXT:    s_or_b32 s24, s25, s24
-; SI-NEXT:    v_readlane_b32 s25, v44, 4
+; SI-NEXT:    v_readlane_b32 s25, v43, 0
 ; SI-NEXT:    s_and_b32 s26, s26, 0xff
-; SI-NEXT:    v_writelane_b32 v43, s18, 57
-; SI-NEXT:    s_lshl_b32 s17, s17, 16
-; SI-NEXT:    s_lshl_b32 s46, s46, 16
-; SI-NEXT:    s_addk_i32 s56, 0x300
-; SI-NEXT:    s_add_i32 s12, s12, 0x3000000
-; SI-NEXT:    s_add_i32 s19, s19, 0x3000000
+; SI-NEXT:    s_and_b32 s73, s9, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s59, s9, 16
+; SI-NEXT:    s_and_b32 s9, s7, 0xffff0000
+; SI-NEXT:    s_add_i32 s6, s6, 0x3000000
 ; SI-NEXT:    s_addk_i32 s24, 0x300
 ; SI-NEXT:    s_lshl_b32 s25, s25, 24
 ; SI-NEXT:    s_lshl_b32 s26, s26, 16
-; SI-NEXT:    v_writelane_b32 v43, s17, 58
-; SI-NEXT:    s_and_b32 s17, s13, 0xffff0000
-; SI-NEXT:    s_or_b32 s46, s47, s46
-; SI-NEXT:    s_and_b32 s47, s56, 0xffff
+; SI-NEXT:    s_and_b32 s63, s17, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s79, s17, 16
+; SI-NEXT:    v_writelane_b32 v42, s9, 50
+; SI-NEXT:    s_lshl_b32 s17, s7, 16
+; SI-NEXT:    s_lshl_b32 s7, s10, 16
+; SI-NEXT:    s_add_i32 s8, s8, 0x3000000
 ; SI-NEXT:    s_and_b32 s24, s24, 0xffff
 ; SI-NEXT:    s_or_b32 s25, s25, s26
-; SI-NEXT:    s_and_b32 s72, s23, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s29, s23, 16
-; SI-NEXT:    s_and_b32 s73, s22, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s23, s22, 16
-; SI-NEXT:    s_and_b32 s78, s19, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s22, s19, 16
-; SI-NEXT:    v_writelane_b32 v43, s17, 59
-; SI-NEXT:    s_lshl_b32 s19, s13, 16
-; SI-NEXT:    s_and_b32 s13, s12, 0xffff0000
-; SI-NEXT:    s_or_b32 s56, s46, s47
-; SI-NEXT:    s_add_i32 s11, s11, 0x3000000
+; SI-NEXT:    v_writelane_b32 v42, s7, 51
+; SI-NEXT:    s_and_b32 s7, s6, 0xffff0000
 ; SI-NEXT:    s_or_b32 s24, s25, s24
-; SI-NEXT:    v_writelane_b32 v43, s13, 60
-; SI-NEXT:    s_lshl_b32 s12, s12, 16
+; SI-NEXT:    v_writelane_b32 v42, s7, 52
+; SI-NEXT:    s_and_b32 s7, s8, 0xffff0000
 ; SI-NEXT:    s_add_i32 s4, s4, 0x3000000
 ; SI-NEXT:    s_add_i32 s5, s5, 0x3000000
 ; SI-NEXT:    s_add_i32 s46, s60, 0x3000000
-; SI-NEXT:    s_add_i32 s47, s58, 0x3000000
 ; SI-NEXT:    s_add_i32 s56, s56, 0x3000000
 ; SI-NEXT:    s_add_i32 s57, s57, 0x3000000
-; SI-NEXT:    s_add_i32 s58, s59, 0x3000000
-; SI-NEXT:    s_add_i32 s8, s8, 0x3000000
-; SI-NEXT:    s_add_i32 s6, s6, 0x3000000
-; SI-NEXT:    s_add_i32 s10, s10, 0x3000000
-; SI-NEXT:    s_add_i32 s7, s7, 0x3000000
-; SI-NEXT:    s_add_i32 s9, s9, 0x3000000
+; SI-NEXT:    s_add_i32 s11, s11, 0x3000000
+; SI-NEXT:    s_add_i32 s12, s12, 0x3000000
+; SI-NEXT:    s_add_i32 s13, s13, 0x3000000
+; SI-NEXT:    s_add_i32 s16, s16, 0x3000000
+; SI-NEXT:    s_add_i32 s18, s18, 0x3000000
+; SI-NEXT:    s_add_i32 s19, s19, 0x3000000
 ; SI-NEXT:    s_add_i32 s20, s20, 0x3000000
 ; SI-NEXT:    s_add_i32 s24, s24, 0x3000000
-; SI-NEXT:    v_writelane_b32 v43, s12, 61
-; SI-NEXT:    s_and_b32 s12, s11, 0xffff0000
+; SI-NEXT:    v_writelane_b32 v42, s7, 53
+; SI-NEXT:    s_lshl_b32 s7, s8, 16
 ; SI-NEXT:    s_and_b32 s27, s24, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s26, s24, 16
 ; SI-NEXT:    s_and_b32 s24, s20, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s20, s20, 16
-; SI-NEXT:    s_and_b32 s75, s21, 0xffff0000
+; SI-NEXT:    s_and_b32 s35, s23, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s29, s23, 16
+; SI-NEXT:    s_and_b32 s90, s22, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s74, s22, 16
+; SI-NEXT:    s_and_b32 s25, s21, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s21, s21, 16
+; SI-NEXT:    s_and_b32 s75, s19, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s22, s19, 16
+; SI-NEXT:    s_and_b32 s61, s18, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s76, s18, 16
+; SI-NEXT:    s_and_b32 s77, s16, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s16, s16, 16
-; SI-NEXT:    v_writelane_b32 v43, s12, 62
+; SI-NEXT:    s_and_b32 s89, s13, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s19, s13, 16
+; SI-NEXT:    s_and_b32 s13, s12, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s88, s12, 16
+; SI-NEXT:    s_and_b32 s60, s11, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s18, s11, 16
-; SI-NEXT:    s_and_b32 s95, s9, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s9, s9, 16
-; SI-NEXT:    s_and_b32 s30, s7, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s17, s7, 16
-; SI-NEXT:    s_and_b32 s34, s10, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s31, s10, 16
-; SI-NEXT:    s_and_b32 s35, s6, 0xffff0000
+; SI-NEXT:    s_and_b32 s23, s10, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    s_and_b32 s37, s8, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s36, s8, 16
-; SI-NEXT:    s_and_b32 s38, s58, 0xffff0000
+; SI-NEXT:    v_writelane_b32 v42, s7, 54
+; SI-NEXT:    s_and_b32 s72, s58, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s99, s58, 16
-; SI-NEXT:    s_and_b32 s48, s57, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s39, s57, 16
+; SI-NEXT:    s_and_b32 s7, s57, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s57, s57, 16
 ; SI-NEXT:    s_and_b32 s49, s56, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s98, s56, 16
+; SI-NEXT:    s_lshl_b32 s8, s56, 16
 ; SI-NEXT:    s_and_b32 s51, s47, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s50, s47, 16
 ; SI-NEXT:    s_and_b32 s52, s46, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s12, s46, 16
+; SI-NEXT:    s_lshl_b32 s97, s46, 16
 ; SI-NEXT:    s_and_b32 s54, s45, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s53, s45, 16
 ; SI-NEXT:    s_and_b32 s55, s44, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s96, s44, 16
+; SI-NEXT:    s_lshl_b32 s28, s44, 16
 ; SI-NEXT:    s_and_b32 s65, s43, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s64, s43, 16
 ; SI-NEXT:    s_and_b32 s66, s42, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s87, s42, 16
-; SI-NEXT:    s_and_b32 s45, s41, 0xffff0000
+; SI-NEXT:    s_and_b32 s68, s41, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s67, s41, 16
-; SI-NEXT:    s_and_b32 s57, s40, 0xffff0000
+; SI-NEXT:    s_and_b32 s69, s40, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s86, s40, 16
-; SI-NEXT:    s_and_b32 s13, s15, 0xffff0000
+; SI-NEXT:    s_and_b32 s62, s15, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s70, s15, 16
 ; SI-NEXT:    s_and_b32 s80, s14, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s85, s14, 16
-; SI-NEXT:    s_and_b32 s82, s5, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s25, s5, 16
-; SI-NEXT:    s_and_b32 s28, s4, 0xffff0000
+; SI-NEXT:    s_and_b32 s92, s5, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s11, s5, 16
+; SI-NEXT:    s_and_b32 s83, s4, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s84, s4, 16
-; SI-NEXT:    v_writelane_b32 v43, s9, 63
+; SI-NEXT:    v_writelane_b32 v42, s7, 55
 ; SI-NEXT:  .LBB89_3: ; %end
 ; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s27
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
@@ -158449,136 +158410,134 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s72
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s35
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s29
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s73
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s90
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s23
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s74
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 12, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s75
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s25
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s21
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 16, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s78
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s75
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s22
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 20, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s79
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s61
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s77
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s76
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 24, v0
-; SI-NEXT:    v_readlane_b32 s4, v43, 56
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s4
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s77
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s16
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 28, v0
-; SI-NEXT:    v_readlane_b32 s4, v43, 57
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s4
-; SI-NEXT:    v_readlane_b32 s4, v43, 58
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s63
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s4
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s79
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v0
-; SI-NEXT:    v_readlane_b32 s4, v43, 59
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s4
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s89
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s19
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 36, v0
-; SI-NEXT:    v_readlane_b32 s4, v43, 60
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s4
-; SI-NEXT:    v_readlane_b32 s4, v43, 61
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s13
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s4
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s88
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 40, v0
-; SI-NEXT:    v_readlane_b32 s4, v43, 62
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s4
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s60
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s18
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s95
-; SI-NEXT:    v_readlane_b32 s4, v43, 63
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s73
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s4
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s59
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 48, v0
+; SI-NEXT:    v_readlane_b32 s4, v42, 50
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s30
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s4
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s17
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s34
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s23
+; SI-NEXT:    v_readlane_b32 s4, v42, 51
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s31
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s4
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
+; SI-NEXT:    v_readlane_b32 s4, v42, 52
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s35
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s4
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s6
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
+; SI-NEXT:    v_readlane_b32 s4, v42, 53
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s37
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s4
+; SI-NEXT:    v_readlane_b32 s4, v42, 54
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s36
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s4
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 64, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s38
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s72
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s99
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
+; SI-NEXT:    v_readlane_b32 s4, v42, 55
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s48
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s4
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s39
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s57
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x48, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s49
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s98
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s8
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -158592,7 +158551,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s52
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s12
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s97
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -158606,7 +158565,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s55
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s96
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s28
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -158625,21 +158584,21 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s45
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s68
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s67
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s57
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s69
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s86
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x6c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s13
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s62
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s70
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
@@ -158653,14 +158612,14 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x74, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s82
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s92
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s25
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s11
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s28
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s83
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s84
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
@@ -158707,118 +158666,109 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB89_4:
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; kill: killed $sgpr8
+; SI-NEXT:    s_mov_b32 s7, s6
+; SI-NEXT:    ; implicit-def: $sgpr8
 ; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    s_mov_b32 s89, s76
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    s_mov_b32 s88, s57
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    s_mov_b32 s94, s26
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    s_mov_b32 s93, s74
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    s_mov_b32 s90, s63
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    s_mov_b32 s91, s77
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    v_readlane_b32 s61, v44, 31
-; SI-NEXT:    v_readlane_b32 s74, v44, 23
-; SI-NEXT:    v_readlane_b32 s60, v44, 30
-; SI-NEXT:    v_readlane_b32 s63, v44, 22
-; SI-NEXT:    v_readlane_b32 s8, v44, 15
-; SI-NEXT:    v_readlane_b32 s46, v44, 27
-; SI-NEXT:    v_readlane_b32 s59, v44, 28
-; SI-NEXT:    v_readlane_b32 s68, v44, 29
-; SI-NEXT:    s_mov_b32 s92, s56
-; SI-NEXT:    v_readlane_b32 s69, v44, 26
-; SI-NEXT:    s_mov_b32 s76, s58
-; SI-NEXT:    v_readlane_b32 s44, v44, 11
-; SI-NEXT:    v_readlane_b32 s42, v44, 34
-; SI-NEXT:    v_readlane_b32 s58, v44, 24
-; SI-NEXT:    v_readlane_b32 s40, v44, 25
-; SI-NEXT:    v_readlane_b32 s41, v44, 19
-; SI-NEXT:    v_readlane_b32 s56, v44, 32
-; SI-NEXT:    v_readlane_b32 s47, v44, 33
-; SI-NEXT:    v_readlane_b32 s14, v44, 18
-; SI-NEXT:    v_readlane_b32 s11, v44, 10
-; SI-NEXT:    v_readlane_b32 s9, v44, 16
-; SI-NEXT:    v_readlane_b32 s10, v44, 17
-; SI-NEXT:    v_readlane_b32 s15, v44, 14
-; SI-NEXT:    v_readlane_b32 s7, v44, 9
-; SI-NEXT:    v_readlane_b32 s62, v44, 8
-; SI-NEXT:    v_readlane_b32 s97, v44, 12
-; SI-NEXT:    v_readlane_b32 s83, v44, 13
-; SI-NEXT:    v_readlane_b32 s81, v44, 20
-; SI-NEXT:    v_readlane_b32 s71, v44, 21
+; SI-NEXT:    ; kill: killed $sgpr8
+; SI-NEXT:    v_readlane_b32 s58, v43, 19
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    s_mov_b32 s95, s47
+; SI-NEXT:    s_mov_b32 s94, s21
+; SI-NEXT:    s_mov_b32 s93, s61
+; SI-NEXT:    s_mov_b32 s34, s73
+; SI-NEXT:    s_mov_b32 s91, s75
+; SI-NEXT:    v_readlane_b32 s56, v43, 10
+; SI-NEXT:    s_mov_b32 s36, s63
+; SI-NEXT:    s_mov_b32 s38, s59
+; SI-NEXT:    s_mov_b32 s37, s42
+; SI-NEXT:    v_readlane_b32 s30, v43, 17
+; SI-NEXT:    v_readlane_b32 s98, v43, 6
+; SI-NEXT:    s_mov_b32 s46, s45
+; SI-NEXT:    s_mov_b32 s31, s43
+; SI-NEXT:    s_mov_b32 s78, s40
+; SI-NEXT:    v_readlane_b32 s15, v43, 14
+; SI-NEXT:    s_mov_b32 s39, s57
+; SI-NEXT:    s_mov_b32 s48, s13
+; SI-NEXT:    v_readlane_b32 s41, v43, 13
+; SI-NEXT:    v_readlane_b32 s44, v43, 5
+; SI-NEXT:    v_readlane_b32 s9, v43, 11
+; SI-NEXT:    v_readlane_b32 s14, v43, 12
+; SI-NEXT:    v_readlane_b32 s81, v43, 9
+; SI-NEXT:    v_readlane_b32 s10, v43, 16
+; SI-NEXT:    v_readlane_b32 s12, v43, 4
+; SI-NEXT:    v_readlane_b32 s96, v43, 7
+; SI-NEXT:    v_readlane_b32 s82, v43, 8
+; SI-NEXT:    v_readlane_b32 s71, v43, 15
 ; SI-NEXT:    ; kill: killed $sgpr6
 ; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; kill: killed $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr8
 ; SI-NEXT:    ; implicit-def: $sgpr26
 ; SI-NEXT:    ; implicit-def: $sgpr27
 ; SI-NEXT:    ; implicit-def: $sgpr20
 ; SI-NEXT:    ; implicit-def: $sgpr24
 ; SI-NEXT:    ; implicit-def: $sgpr29
-; SI-NEXT:    ; implicit-def: $sgpr72
-; SI-NEXT:    ; implicit-def: $sgpr23
-; SI-NEXT:    ; implicit-def: $sgpr73
+; SI-NEXT:    ; implicit-def: $sgpr35
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr90
 ; SI-NEXT:    ; implicit-def: $sgpr21
-; SI-NEXT:    ; implicit-def: $sgpr75
+; SI-NEXT:    ; implicit-def: $sgpr25
 ; SI-NEXT:    ; implicit-def: $sgpr22
-; SI-NEXT:    ; implicit-def: $sgpr78
+; SI-NEXT:    ; implicit-def: $sgpr75
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr61
+; SI-NEXT:    ; implicit-def: $sgpr16
 ; SI-NEXT:    ; implicit-def: $sgpr77
 ; SI-NEXT:    ; implicit-def: $sgpr79
-; SI-NEXT:    ; implicit-def: $sgpr16
+; SI-NEXT:    ; implicit-def: $sgpr63
 ; SI-NEXT:    ; implicit-def: $sgpr19
+; SI-NEXT:    ; implicit-def: $sgpr89
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr13
 ; SI-NEXT:    ; implicit-def: $sgpr18
-; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    ; implicit-def: $sgpr95
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr59
+; SI-NEXT:    ; implicit-def: $sgpr73
 ; SI-NEXT:    ; implicit-def: $sgpr17
-; SI-NEXT:    ; implicit-def: $sgpr30
-; SI-NEXT:    ; implicit-def: $sgpr31
-; SI-NEXT:    ; implicit-def: $sgpr34
+; SI-NEXT:    ; kill: killed $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr23
 ; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $sgpr35
-; SI-NEXT:    ; implicit-def: $sgpr36
-; SI-NEXT:    ; implicit-def: $sgpr37
 ; SI-NEXT:    ; implicit-def: $sgpr99
-; SI-NEXT:    ; implicit-def: $sgpr38
-; SI-NEXT:    ; implicit-def: $sgpr39
-; SI-NEXT:    ; implicit-def: $sgpr48
-; SI-NEXT:    ; implicit-def: $sgpr98
+; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr57
+; SI-NEXT:    ; kill: killed $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr8
 ; SI-NEXT:    ; implicit-def: $sgpr49
 ; SI-NEXT:    ; implicit-def: $sgpr50
 ; SI-NEXT:    ; implicit-def: $sgpr51
-; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr97
 ; SI-NEXT:    ; implicit-def: $sgpr52
 ; SI-NEXT:    ; implicit-def: $sgpr53
 ; SI-NEXT:    ; implicit-def: $sgpr54
-; SI-NEXT:    ; implicit-def: $sgpr96
+; SI-NEXT:    ; implicit-def: $sgpr28
 ; SI-NEXT:    ; implicit-def: $sgpr55
 ; SI-NEXT:    ; implicit-def: $sgpr64
 ; SI-NEXT:    ; implicit-def: $sgpr65
 ; SI-NEXT:    ; implicit-def: $sgpr87
 ; SI-NEXT:    ; implicit-def: $sgpr66
 ; SI-NEXT:    ; implicit-def: $sgpr67
-; SI-NEXT:    ; implicit-def: $sgpr45
+; SI-NEXT:    ; implicit-def: $sgpr68
 ; SI-NEXT:    ; implicit-def: $sgpr86
-; SI-NEXT:    ; implicit-def: $sgpr57
+; SI-NEXT:    ; implicit-def: $sgpr69
 ; SI-NEXT:    ; implicit-def: $sgpr70
-; SI-NEXT:    ; implicit-def: $sgpr13
+; SI-NEXT:    ; implicit-def: $sgpr62
 ; SI-NEXT:    ; implicit-def: $sgpr85
 ; SI-NEXT:    ; implicit-def: $sgpr80
-; SI-NEXT:    ; implicit-def: $sgpr25
-; SI-NEXT:    ; implicit-def: $sgpr82
+; SI-NEXT:    ; implicit-def: $sgpr11
+; SI-NEXT:    ; implicit-def: $sgpr92
 ; SI-NEXT:    ; implicit-def: $sgpr84
-; SI-NEXT:    ; implicit-def: $sgpr28
+; SI-NEXT:    ; implicit-def: $sgpr83
 ; SI-NEXT:    s_branch .LBB89_2
 ;
 ; VI-LABEL: bitcast_v128i8_to_v64bf16_scalar:
@@ -158881,14 +158831,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; VI-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:168
 ; VI-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:176
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v3
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v5
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v7
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v40, 8, v27
 ; VI-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
 ; VI-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
 ; VI-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
@@ -158900,42 +158849,46 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; VI-NEXT:    v_lshlrev_b32_e32 v25, 8, v25
 ; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v29
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v27
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v2
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v29
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v4
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v6
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v4
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v6
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v10
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v12
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v10
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v24
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v12
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v26
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
 ; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v28
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v30
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v31
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v32
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v33
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v34
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v35
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v36
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v37
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v38
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:184
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:192
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:200
@@ -158944,37 +158897,34 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; VI-NEXT:    buffer_load_ushort v5, off, s[0:3], s32 offset:224
 ; VI-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:232
 ; VI-NEXT:    buffer_load_ushort v7, off, s[0:3], s32 offset:240
-; VI-NEXT:    v_lshlrev_b32_e32 v8, 8, v26
-; VI-NEXT:    v_lshlrev_b32_e32 v10, 8, v28
+; VI-NEXT:    v_lshlrev_b32_e32 v45, 8, v22
+; VI-NEXT:    v_lshlrev_b32_e32 v8, 8, v24
 ; VI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; VI-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
 ; VI-NEXT:    v_lshlrev_b32_e32 v16, 8, v16
 ; VI-NEXT:    v_lshlrev_b32_e32 v18, 8, v18
 ; VI-NEXT:    v_lshlrev_b32_e32 v20, 8, v20
-; VI-NEXT:    v_lshlrev_b32_e32 v22, 8, v22
 ; VI-NEXT:    s_waitcnt vmcnt(7)
-; VI-NEXT:    v_lshlrev_b32_e32 v24, 8, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v22, 8, v0
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(6)
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v2
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v4
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v5
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v6
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v7
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v24, 8, v2
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:248
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:256
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:264
@@ -158993,11 +158943,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v4
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v7
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:312
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:320
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:328
@@ -159006,8 +158951,12 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; VI-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:36
+; VI-NEXT:    s_waitcnt vmcnt(11)
+; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
 ; VI-NEXT:    v_lshlrev_b32_e32 v28, 8, v3
 ; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
+; VI-NEXT:    s_waitcnt vmcnt(10)
+; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v7
 ; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, 8, v0
 ; VI-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:44
@@ -159016,45 +158965,47 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; VI-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:68
 ; VI-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:76
 ; VI-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:84
-; VI-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:92
-; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:100
-; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:108
+; VI-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:92
+; VI-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:100
+; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:108
 ; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:116
 ; VI-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:124
-; VI-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:132
-; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:140
-; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:148
-; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:156
-; VI-NEXT:    buffer_load_ushort v60, off, s[0:3], s32 offset:164
-; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:172
+; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:132
+; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:140
+; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:148
+; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:156
+; VI-NEXT:    buffer_load_ushort v46, off, s[0:3], s32 offset:164
+; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:172
 ; VI-NEXT:    buffer_load_ushort v47, off, s[0:3], s32 offset:180
 ; VI-NEXT:    buffer_load_ushort v63, off, s[0:3], s32 offset:188
-; VI-NEXT:    buffer_load_ushort v46, off, s[0:3], s32 offset:196
+; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:196
 ; VI-NEXT:    buffer_load_ushort v61, off, s[0:3], s32 offset:204
-; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:212
+; VI-NEXT:    buffer_load_ushort v60, off, s[0:3], s32 offset:212
 ; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:220
-; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:228
-; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:236
-; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:244
-; VI-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:252
+; VI-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:228
+; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:236
+; VI-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:244
+; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:252
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:260
 ; VI-NEXT:    s_waitcnt vmcnt(14)
-; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v1
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:268
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:276
-; VI-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:284
-; VI-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:292
-; VI-NEXT:    buffer_load_ushort v7, off, s[0:3], s32 offset:300
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:308
+; VI-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:268
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:276
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:316
-; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:324
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:284
+; VI-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:292
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:300
+; VI-NEXT:    buffer_load_ushort v7, off, s[0:3], s32 offset:308
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v7, off, s[0:3], s32 offset:316
+; VI-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:324
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
@@ -159064,50 +159015,46 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; VI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; VI-NEXT:    s_cbranch_scc0 .LBB89_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
@@ -159124,10 +159071,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
 ; VI-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -159151,7 +159099,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v17, v10
 ; VI-NEXT:    v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
@@ -159168,43 +159115,38 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v40, v42
-; VI-NEXT:    v_mov_b32_e32 v42, v44
-; VI-NEXT:    v_mov_b32_e32 v44, v45
-; VI-NEXT:    v_mov_b32_e32 v45, v62
-; VI-NEXT:    v_or_b32_sdwa v2, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v3, v53, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; VI-NEXT:    v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v34, v24
 ; VI-NEXT:    v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v36, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v37, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -159212,74 +159154,77 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; VI-NEXT:    v_or_b32_sdwa v0, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v39, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; VI-NEXT:    v_or_b32_sdwa v0, v48, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v48, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v45, v62
 ; VI-NEXT:    v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v32, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v54, v22
+; VI-NEXT:    v_mov_b32_e32 v41, v24
 ; VI-NEXT:    v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_mov_b32_e32 v33, v0
+; VI-NEXT:    v_mov_b32_e32 v34, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v37, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v50, v26
+; VI-NEXT:    v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v55, v26
 ; VI-NEXT:    v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v39, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v49, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v43, v27
 ; VI-NEXT:    v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v51, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v53, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v52, v28
+; VI-NEXT:    v_mov_b32_e32 v35, v1
+; VI-NEXT:    v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v53, v28
 ; VI-NEXT:    v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT:    v_or_b32_sdwa v1, v47, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v1, v47, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v55, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v33, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_mov_b32_e32 v35, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v1, v57, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v41, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v63, v27
-; VI-NEXT:    v_mov_b32_e32 v46, v57
+; VI-NEXT:    v_mov_b32_e32 v36, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_mov_b32_e32 v36, v0
+; VI-NEXT:    v_mov_b32_e32 v56, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v56, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v61, v59
+; VI-NEXT:    v_mov_b32_e32 v58, v1
+; VI-NEXT:    v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v61, v60
+; VI-NEXT:    v_mov_b32_e32 v60, v59
 ; VI-NEXT:    v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
@@ -159291,53 +159236,55 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; VI-NEXT:    v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; VI-NEXT:    v_or_b32_sdwa v1, v44, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v1, v45, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_mov_b32_e32 v58, v0
+; VI-NEXT:    v_mov_b32_e32 v50, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
 ; VI-NEXT:    v_or_b32_sdwa v1, v62, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v43, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v52, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; VI-NEXT:    v_or_b32_sdwa v0, v59, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v60, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v46, v1
+; VI-NEXT:    v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_mov_b32_e32 v54, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v63, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v47, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_mov_b32_e32 v57, v1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v3, s4, v0
@@ -159369,10 +159316,12 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    s_cbranch_execnz .LBB89_3
 ; VI-NEXT:  .LBB89_2: ; %cmp.true
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v59
+; VI-NEXT:    v_or_b32_sdwa v29, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
 ; VI-NEXT:    s_add_i32 s28, s28, 3
 ; VI-NEXT:    s_and_b32 s4, s28, 0xff
 ; VI-NEXT:    s_lshl_b32 s5, s29, 8
@@ -159391,147 +159340,165 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; VI-NEXT:    s_lshl_b32 s9, s19, 8
 ; VI-NEXT:    s_add_i32 s16, s16, 3
 ; VI-NEXT:    s_lshl_b32 s10, s17, 8
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v28, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v59
-; VI-NEXT:    v_or_b32_sdwa v25, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v26, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v62
-; VI-NEXT:    v_or_b32_sdwa v27, v63, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v45
-; VI-NEXT:    v_or_b32_sdwa v52, v43, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v28, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v44
-; VI-NEXT:    v_or_b32_sdwa v26, v50, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v53, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v45
+; VI-NEXT:    v_or_b32_sdwa v27, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v42
-; VI-NEXT:    v_or_b32_sdwa v63, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v52, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v40
-; VI-NEXT:    v_or_b32_sdwa v43, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v61
+; VI-NEXT:    v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v60
 ; VI-NEXT:    v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v46
-; VI-NEXT:    v_or_b32_sdwa v24, v56, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v61
+; VI-NEXT:    v_or_b32_sdwa v24, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v48, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v48, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v48, vcc, 0x300, v48
 ; VI-NEXT:    v_or_b32_sdwa v24, v24, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v24, vcc, 0x3000000, v24
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    v_or_b32_sdwa v23, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v38, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v38, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v38, vcc, 0x300, v38
 ; VI-NEXT:    v_or_b32_sdwa v23, v23, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v23, vcc, 0x3000000, v23
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v22, v34, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v22, v54, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v36, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
-; VI-NEXT:    v_add_u32_e32 v50, vcc, 0x300, v36
+; VI-NEXT:    v_or_b32_sdwa v50, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT:    v_add_u32_e32 v50, vcc, 0x300, v50
 ; VI-NEXT:    v_or_b32_sdwa v22, v22, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v22, vcc, 0x3000000, v22
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v21, v53, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v21, v35, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v53, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v54, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    v_or_b32_sdwa v20, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    v_or_b32_sdwa v49, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v49, vcc, 0x300, v49
 ; VI-NEXT:    v_or_b32_sdwa v20, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v20, vcc, 0x3000000, v20
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    v_or_b32_sdwa v19, v37, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v37, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v37, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v37, vcc, 0x300, v37
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT:    v_or_b32_sdwa v31, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
 ; VI-NEXT:    v_or_b32_sdwa v19, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v19, vcc, 0x3000000, v19
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    v_or_b32_sdwa v18, v32, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT:    v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v58, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
-; VI-NEXT:    v_add_u32_e32 v36, vcc, 0x300, v58
-; VI-NEXT:    v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_or_b32_sdwa v57, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; VI-NEXT:    v_add_u32_e32 v35, vcc, 0x300, v57
+; VI-NEXT:    v_or_b32_sdwa v18, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v18, vcc, 0x3000000, v18
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v34, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v34, vcc, 0x300, v34
 ; VI-NEXT:    v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v14, vcc, 0x3000000, v14
@@ -159540,78 +159507,67 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v13, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
-; VI-NEXT:    v_or_b32_sdwa v31, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_or_b32_sdwa v35, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v36, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
-; VI-NEXT:    v_add_u32_e32 v35, vcc, 0x300, v35
-; VI-NEXT:    v_or_b32_sdwa v13, v13, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    v_add_u32_e32 v35, vcc, 0x300, v25
-; VI-NEXT:    v_add_u32_e32 v25, vcc, 0x300, v59
-; VI-NEXT:    v_or_b32_sdwa v25, v43, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    v_or_b32_sdwa v28, v28, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT:    v_add_u32_e32 v36, vcc, 0x300, v36
+; VI-NEXT:    v_or_b32_sdwa v13, v13, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v36, vcc, 0x300, v26
+; VI-NEXT:    v_add_u32_e32 v26, vcc, 0x300, v52
+; VI-NEXT:    v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v52, vcc, 0x300, v54
+; VI-NEXT:    v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v13, vcc, 0x3000000, v13
-; VI-NEXT:    v_add_u32_e32 v25, vcc, 0x3000000, v25
-; VI-NEXT:    v_add_u32_e32 v28, vcc, 0x3000000, v28
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT:    v_add_u32_e32 v21, vcc, 0x3000000, v21
+; VI-NEXT:    v_add_u32_e32 v26, vcc, 0x3000000, v26
 ; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v12, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
-; VI-NEXT:    v_or_b32_sdwa v30, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v51, vcc, 0x300, v51
 ; VI-NEXT:    v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    v_add_u32_e32 v33, vcc, 0x300, v1
-; VI-NEXT:    v_add_u32_e32 v51, vcc, 0x300, v63
-; VI-NEXT:    v_or_b32_sdwa v26, v26, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    v_or_b32_sdwa v30, v30, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v51, vcc, 0x300, v59
+; VI-NEXT:    v_or_b32_sdwa v25, v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v12, vcc, 0x3000000, v12
-; VI-NEXT:    v_add_u32_e32 v26, vcc, 0x3000000, v26
-; VI-NEXT:    v_add_u32_e32 v30, vcc, 0x3000000, v30
+; VI-NEXT:    v_add_u32_e32 v25, vcc, 0x3000000, v25
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v57, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v33, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v40, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT:    v_or_b32_sdwa v30, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_or_b32_sdwa v39, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT:    v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_add_u32_e32 v34, vcc, 0x300, v2
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -159635,14 +159591,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; VI-NEXT:    v_add_u32_e32 v41, vcc, 0x300, v10
 ; VI-NEXT:    v_add_u32_e32 v10, vcc, 0x300, v55
 ; VI-NEXT:    v_or_b32_sdwa v10, v39, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    v_add_u32_e32 v39, vcc, 0x300, v52
-; VI-NEXT:    v_add_u32_e32 v52, vcc, 0x300, v53
-; VI-NEXT:    v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    v_or_b32_sdwa v27, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v39, vcc, 0x300, v53
+; VI-NEXT:    v_or_b32_sdwa v27, v28, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_or_b32_sdwa v28, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_or_b32_sdwa v29, v30, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v9, vcc, 0x3000000, v9
 ; VI-NEXT:    v_add_u32_e32 v10, vcc, 0x3000000, v10
-; VI-NEXT:    v_add_u32_e32 v21, vcc, 0x3000000, v21
 ; VI-NEXT:    v_add_u32_e32 v27, vcc, 0x3000000, v27
+; VI-NEXT:    v_add_u32_e32 v28, vcc, 0x3000000, v28
+; VI-NEXT:    v_add_u32_e32 v29, vcc, 0x3000000, v29
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -159658,14 +159615,18 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; VI-NEXT:    v_add_u32_e32 v42, vcc, 0x300, v42
 ; VI-NEXT:    v_or_b32_sdwa v8, v8, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v42, vcc, 0x300, v11
-; VI-NEXT:    v_or_b32_sdwa v17, v17, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v11, vcc, 0x300, v40
-; VI-NEXT:    v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_or_b32_sdwa v11, v33, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v33, vcc, 0x300, v1
+; VI-NEXT:    v_or_b32_sdwa v30, v31, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v17, v17, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, 0x3000000, v8
 ; VI-NEXT:    v_add_u32_e32 v11, vcc, 0x3000000, v11
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_add_u32_e32 v30, vcc, 0x3000000, v30
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v7, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
@@ -159705,29 +159666,19 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; VI-NEXT:    v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
-; VI-NEXT:    v_or_b32_sdwa v29, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v46, vcc, 0x300, v46
 ; VI-NEXT:    v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v5, vcc, 0x3000000, v5
-; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
-; VI-NEXT:    v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u32_e32 v34, vcc, 0x300, v2
-; VI-NEXT:    v_or_b32_sdwa v29, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    v_add_u32_e32 v29, vcc, 0x3000000, v29
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 3, v4
 ; VI-NEXT:    v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v47, vcc, 3, v32
-; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 0x300, v4
 ; VI-NEXT:    v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 0x3000000, v4
@@ -159794,38 +159745,35 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ; VI-NEXT:  .LBB89_4:
-; VI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT:    v_mov_b32_e32 v61, v59
-; VI-NEXT:    v_mov_b32_e32 v46, v57
-; VI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; VI-NEXT:    v_mov_b32_e32 v40, v42
-; VI-NEXT:    v_mov_b32_e32 v42, v44
-; VI-NEXT:    v_mov_b32_e32 v44, v45
+; VI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v61, v60
+; VI-NEXT:    v_mov_b32_e32 v60, v59
 ; VI-NEXT:    v_mov_b32_e32 v45, v62
-; VI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v57, v5
 ; VI-NEXT:    v_mov_b32_e32 v47, v4
-; VI-NEXT:    v_mov_b32_e32 v54, v3
-; VI-NEXT:    v_mov_b32_e32 v52, v28
-; VI-NEXT:    v_mov_b32_e32 v63, v27
-; VI-NEXT:    v_mov_b32_e32 v50, v26
-; VI-NEXT:    v_mov_b32_e32 v34, v24
+; VI-NEXT:    v_mov_b32_e32 v63, v3
+; VI-NEXT:    v_mov_b32_e32 v53, v28
+; VI-NEXT:    v_mov_b32_e32 v43, v27
+; VI-NEXT:    v_mov_b32_e32 v55, v26
+; VI-NEXT:    v_mov_b32_e32 v41, v24
+; VI-NEXT:    v_mov_b32_e32 v54, v22
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; VI-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; VI-NEXT:    s_branch .LBB89_2
@@ -159887,18 +159835,18 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v29
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 8, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 8, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v46, 8, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 8, v7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 8, v9
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 8, v11
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 8, v13
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 8, v15
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 8, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 8, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 8, v25
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 8, v27
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 8, v19
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 8, v21
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 8, v23
-; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 8, v25
-; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
 ; GFX9-NEXT:    s_waitcnt vmcnt(24)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v43
 ; GFX9-NEXT:    s_waitcnt vmcnt(23)
@@ -159927,10 +159875,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(23)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v52
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(23)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v51
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(23)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v50
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
@@ -159942,7 +159890,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(23)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v39
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(23)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v30
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
@@ -159990,7 +159938,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v3
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v13
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
@@ -160017,23 +159965,23 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v15
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v13
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v5
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v9
-; GFX9-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:312
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:320
@@ -160046,49 +159994,48 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    s_waitcnt vmcnt(15)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 8, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 8, v1
 ; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:44
 ; GFX9-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:52
 ; GFX9-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:60
 ; GFX9-NEXT:    buffer_load_ushort v15, off, s[0:3], s32 offset:68
 ; GFX9-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:76
 ; GFX9-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:84
-; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:92
+; GFX9-NEXT:    buffer_load_ushort v25, off, s[0:3], s32 offset:92
 ; GFX9-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:100
 ; GFX9-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:108
-; GFX9-NEXT:    buffer_load_ushort v46, off, s[0:3], s32 offset:116
+; GFX9-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:116
 ; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:124
 ; GFX9-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:132
-; GFX9-NEXT:    buffer_load_ushort v61, off, s[0:3], s32 offset:140
+; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:140
 ; GFX9-NEXT:    buffer_load_ushort v47, off, s[0:3], s32 offset:148
-; GFX9-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:156
+; GFX9-NEXT:    buffer_load_ushort v61, off, s[0:3], s32 offset:156
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:164
-; GFX9-NEXT:    s_waitcnt vmcnt(22)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(21)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:172
-; GFX9-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:180
-; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:188
-; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:196
-; GFX9-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:204
+; GFX9-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:172
+; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:180
+; GFX9-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:188
+; GFX9-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:196
+; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:204
 ; GFX9-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:212
-; GFX9-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:220
-; GFX9-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:228
+; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:220
+; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:228
 ; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:236
 ; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:244
-; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:252
-; GFX9-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:260
+; GFX9-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:252
+; GFX9-NEXT:    buffer_load_ushort v27, off, s[0:3], s32 offset:260
 ; GFX9-NEXT:    buffer_load_ushort v60, off, s[0:3], s32 offset:268
-; GFX9-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:276
+; GFX9-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:276
 ; GFX9-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:284
 ; GFX9-NEXT:    buffer_load_ushort v63, off, s[0:3], s32 offset:292
-; GFX9-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:300
-; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:308
-; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:316
+; GFX9-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:300
+; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:308
+; GFX9-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:316
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:324
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
@@ -160099,54 +160046,55 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(28)
-; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(30)
-; GFX9-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(33)
 ; GFX9-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(36)
-; GFX9-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(38)
+; GFX9-NEXT:    s_waitcnt vmcnt(39)
 ; GFX9-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(40)
-; GFX9-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(40)
-; GFX9-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(41)
+; GFX9-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(41)
+; GFX9-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(40)
+; GFX9-NEXT:    s_waitcnt vmcnt(41)
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(40)
-; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(40)
+; GFX9-NEXT:    s_waitcnt vmcnt(41)
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(41)
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
@@ -160156,7 +160104,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB89_2
@@ -160169,7 +160117,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v4, v4, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v4, v4, 16, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -160206,10 +160154,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v7, v1, 16, v0
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
@@ -160225,13 +160173,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v9, v1, 16, v0
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -160239,7 +160187,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
@@ -160280,8 +160228,8 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    v_mov_b32_e32 v52, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v14, v1, 16, v0
-; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v50, v0
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -160299,16 +160247,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v48, v1
-; GFX9-NEXT:    v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v17, v17, 16, v1
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    v_mov_b32_e32 v33, v46
+; GFX9-NEXT:    v_mov_b32_e32 v33, v45
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v18, v1, 16, v0
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
@@ -160321,7 +160269,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -160330,7 +160278,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -160338,122 +160286,121 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v22, v1, 16, v0
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v34, v1
-; GFX9-NEXT:    v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v23, v1, 16, v0
-; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_mov_b32_e32 v40, v0
-; GFX9-NEXT:    v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v46, v0
+; GFX9-NEXT:    v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v35, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v35, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v24, v1, 16, v0
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v51, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_mov_b32_e32 v35, v45
+; GFX9-NEXT:    v_mov_b32_e32 v45, v61
+; GFX9-NEXT:    v_mov_b32_e32 v61, v42
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v38, v0
-; GFX9-NEXT:    v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_or_b32_sdwa v1, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_lshl_or_b32 v25, v1, 16, v0
+; GFX9-NEXT:    v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v54, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v41, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    v_mov_b32_e32 v41, v57
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_lshl_or_b32 v26, v1, 16, v0
-; GFX9-NEXT:    v_or_b32_sdwa v0, v44, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_or_b32_sdwa v1, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_or_b32_sdwa v0, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v45, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_lshl_or_b32 v27, v1, 16, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v60, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v60, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_or_b32_sdwa v1, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v57, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v28, v1, 16, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v59, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v63, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_mov_b32_e32 v57, v59
+; GFX9-NEXT:    v_or_b32_sdwa v1, v63, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v29, v1, 16, v0
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v30, v1, 16, v0
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v56, v42
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v31, v1, 16, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:    s_branch .LBB89_3
 ; GFX9-NEXT:  .LBB89_2:
 ; GFX9-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_mov_b32_e32 v33, v46
-; GFX9-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_mov_b32_e32 v33, v45
+; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_mov_b32_e32 v56, v61
-; GFX9-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX9-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; GFX9-NEXT:  .LBB89_3: ; %Flow
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
@@ -160656,7 +160603,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_add_u32_e32 v0, 3, v0
-; GFX9-NEXT:    v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v1, 3, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -160716,11 +160663,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v26, 3, v26
-; GFX9-NEXT:    v_or_b32_sdwa v48, v40, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v48, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v26, 3, v26
-; GFX9-NEXT:    v_or_b32_sdwa v49, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v49, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v26, 3, v26
@@ -160755,7 +160702,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    v_or_b32_sdwa v53, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_add_u32_e32 v24, 3, v24
-; GFX9-NEXT:    v_add_u32_e32 v26, 3, v62
+; GFX9-NEXT:    v_add_u32_e32 v26, 3, v61
 ; GFX9-NEXT:    v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v36, 0x300, v24
 ; GFX9-NEXT:    v_add_u32_e32 v24, 0x300, v48
@@ -160764,7 +160711,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v54, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_add_u32_e32 v26, 3, v61
+; GFX9-NEXT:    v_add_u32_e32 v26, 3, v45
 ; GFX9-NEXT:    v_add_u32_e32 v20, 3, v20
 ; GFX9-NEXT:    v_or_b32_sdwa v20, v57, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v34, 0x300, v20
@@ -160773,7 +160720,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_add_u32_e32 v26, 3, v56
 ; GFX9-NEXT:    v_add_u32_e32 v21, 3, v21
-; GFX9-NEXT:    v_or_b32_sdwa v21, v45, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v21, v32, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v28, 0x300, v21
 ; GFX9-NEXT:    v_add_u32_e32 v21, 0x300, v54
 ; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff, v21
@@ -162617,30 +162564,29 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:136
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:12
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:20
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:28
-; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:44
-; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:52
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:60
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:68
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:76
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:72
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:84
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:44
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:48
+; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:60
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:56
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:68
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:64
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:76
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:72
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:80
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:92
+; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:84
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:80
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:92
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -162729,31 +162675,31 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v32
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v33
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v34
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v37
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v38
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; SI-NEXT:    v_mul_f32_e32 v51, 1.0, v39
+; SI-NEXT:    v_mul_f32_e32 v61, 1.0, v36
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v50
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; SI-NEXT:    v_mul_f32_e32 v16, 1.0, v29
-; SI-NEXT:    v_mul_f32_e32 v24, 1.0, v55
+; SI-NEXT:    v_mul_f32_e32 v57, 1.0, v29
+; SI-NEXT:    v_mul_f32_e32 v42, 1.0, v35
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v40
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v54
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
-; SI-NEXT:    v_mul_f32_e32 v55, 1.0, v41
-; SI-NEXT:    v_mul_f32_e32 v56, 1.0, v43
+; SI-NEXT:    v_mul_f32_e32 v62, 1.0, v37
+; SI-NEXT:    v_mul_f32_e32 v6, 1.0, v41
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v44
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v43
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; SI-NEXT:    v_mul_f32_e32 v9, 1.0, v47
-; SI-NEXT:    v_mul_f32_e32 v62, 1.0, v60
+; SI-NEXT:    v_mul_f32_e32 v9, 1.0, v56
+; SI-NEXT:    v_mul_f32_e32 v59, 1.0, v39
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v63
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v33
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:88
@@ -162764,21 +162710,21 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:116
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:112
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:124
-; SI-NEXT:    v_mul_f32_e32 v6, 1.0, v42
-; SI-NEXT:    v_mul_f32_e32 v41, 1.0, v61
-; SI-NEXT:    ; implicit-def: $vgpr61
-; SI-NEXT:    ; kill: killed $vgpr61
-; SI-NEXT:    ; implicit-def: $vgpr61
-; SI-NEXT:    v_mul_f32_e32 v58, 1.0, v34
-; SI-NEXT:    v_mul_f32_e32 v59, 1.0, v35
-; SI-NEXT:    v_mul_f32_e32 v12, 1.0, v36
-; SI-NEXT:    v_mul_f32_e32 v49, 1.0, v38
-; SI-NEXT:    v_mul_f32_e32 v57, 1.0, v48
-; SI-NEXT:    v_mul_f32_e32 v53, 1.0, v52
-; SI-NEXT:    v_mul_f32_e32 v46, 1.0, v54
-; SI-NEXT:    v_mul_f32_e32 v33, 1.0, v45
-; SI-NEXT:    ; kill: killed $vgpr61
-; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    v_mul_f32_e32 v56, 1.0, v58
+; SI-NEXT:    ; implicit-def: $vgpr58
+; SI-NEXT:    ; kill: killed $vgpr58
+; SI-NEXT:    ; implicit-def: $vgpr58
+; SI-NEXT:    v_mul_f32_e32 v60, 1.0, v48
+; SI-NEXT:    v_mul_f32_e32 v45, 1.0, v49
+; SI-NEXT:    v_mul_f32_e32 v49, 1.0, v51
+; SI-NEXT:    v_mul_f32_e32 v51, 1.0, v52
+; SI-NEXT:    v_mul_f32_e32 v24, 1.0, v53
+; SI-NEXT:    v_mul_f32_e32 v46, 1.0, v55
+; SI-NEXT:    v_mul_f32_e32 v53, 1.0, v40
+; SI-NEXT:    v_mul_f32_e32 v30, 1.0, v44
+; SI-NEXT:    v_mul_f32_e32 v19, 1.0, v63
+; SI-NEXT:    ; kill: killed $vgpr58
+; SI-NEXT:    ; implicit-def: $vgpr58
 ; SI-NEXT:    ; implicit-def: $vgpr38
 ; SI-NEXT:    ; implicit-def: $vgpr40
 ; SI-NEXT:    ; implicit-def: $vgpr35
@@ -162802,30 +162748,30 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr22
 ; SI-NEXT:    ; implicit-def: $vgpr20
 ; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr27
+; SI-NEXT:    ; implicit-def: $vgpr47
 ; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr63
-; SI-NEXT:    ; kill: killed $vgpr61
-; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; kill: killed $vgpr58
+; SI-NEXT:    ; implicit-def: $vgpr58
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_mul_f32_e32 v43, 1.0, v1
+; SI-NEXT:    v_mul_f32_e32 v41, 1.0, v1
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_mul_f32_e32 v47, 1.0, v2
+; SI-NEXT:    v_mul_f32_e32 v12, 1.0, v2
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_mul_f32_e32 v39, 1.0, v3
+; SI-NEXT:    v_mul_f32_e32 v27, 1.0, v3
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:120
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:132
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:128
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_mul_f32_e32 v19, 1.0, v4
+; SI-NEXT:    v_mul_f32_e32 v63, 1.0, v4
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_mul_f32_e32 v42, 1.0, v7
+; SI-NEXT:    v_mul_f32_e32 v36, 1.0, v7
 ; SI-NEXT:    ; implicit-def: $vgpr7
 ; SI-NEXT:    ; kill: killed $vgpr7
 ; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    v_mul_f32_e32 v36, 1.0, v5
+; SI-NEXT:    v_mul_f32_e32 v16, 1.0, v5
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_mul_f32_e32 v44, 1.0, v8
+; SI-NEXT:    v_mul_f32_e32 v39, 1.0, v8
 ; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_mul_f32_e32 v21, 1.0, v10
 ; SI-NEXT:    ; kill: killed $vgpr7
@@ -162837,7 +162783,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    ; kill: killed $vgpr7
 ; SI-NEXT:    ; implicit-def: $vgpr7
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_mul_f32_e32 v60, 1.0, v1
+; SI-NEXT:    v_mul_f32_e32 v43, 1.0, v1
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
@@ -163010,9 +162956,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_mul_f32_e32 v30, 1.0, v2
+; SI-NEXT:    v_mul_f32_e32 v44, 1.0, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v45, 1.0, v3
+; SI-NEXT:    v_mul_f32_e32 v55, 1.0, v3
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr3
@@ -163025,10 +162971,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(4)
@@ -163058,7 +163004,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v26, v1, v2, 16
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v63
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v58
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_alignbit_b32 v40, v1, v2, 16
@@ -163070,7 +163016,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v23, v1, v2, 16
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v61
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v33
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_alignbit_b32 v54, v1, v2, 16
@@ -163082,7 +163028,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v18, v1, v2, 16
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v27
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v47
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_alignbit_b32 v52, v1, v2, 16
@@ -163091,148 +163037,88 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_alignbit_b32 v15, v1, v16, 16
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v15, v1, v57, 16
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v57
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; SI-NEXT:    v_alignbit_b32 v50, v1, v2, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v58
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v42
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; SI-NEXT:    v_alignbit_b32 v13, v1, v59, 16
-; SI-NEXT:    ; implicit-def: $vgpr59
+; SI-NEXT:    v_alignbit_b32 v13, v1, v61, 16
+; SI-NEXT:    ; implicit-def: $vgpr61
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v58
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v42
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_alignbit_b32 v48, v1, v2, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v49
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v59
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; SI-NEXT:    v_alignbit_b32 v11, v1, v51, 16
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v49
+; SI-NEXT:    v_alignbit_b32 v11, v1, v60, 16
+; SI-NEXT:    ; implicit-def: $vgpr60
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_alignbit_b32 v37, v1, v2, 16
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v59
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_alignbit_b32 v37, v1, v2, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v53
-; SI-NEXT:    v_alignbit_b32 v8, v1, v46, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v49
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v8, v1, v51, 16
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v51
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT:    ; implicit-def: $vgpr46
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v49
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_alignbit_b32 v34, v1, v2, 16
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v55
-; SI-NEXT:    v_alignbit_b32 v5, v1, v6, 16
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v53
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v46
+; SI-NEXT:    v_alignbit_b32 v5, v1, v53, 16
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v51
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT:    ; implicit-def: $vgpr55
+; SI-NEXT:    ; implicit-def: $vgpr46
+; SI-NEXT:    ; implicit-def: $vgpr53
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_alignbit_b32 v31, v1, v2, 16
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v33
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v30
 ; SI-NEXT:    v_alignbit_b32 v4, v1, v9, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v62
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 24, v63
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; kill: killed $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $vgpr30
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_alignbit_b32 v28, v1, v2, 16
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v41
-; SI-NEXT:    v_alignbit_b32 v3, v1, v43, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v57
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v56
+; SI-NEXT:    v_alignbit_b32 v3, v1, v41, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v45
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr56
 ; SI-NEXT:    ; implicit-def: $vgpr41
-; SI-NEXT:    ; implicit-def: $vgpr43
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_alignbit_b32 v25, v1, v2, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v19
-; SI-NEXT:    v_alignbit_b32 v2, v1, v36, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v63
+; SI-NEXT:    v_alignbit_b32 v2, v1, v16, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v24
-; SI-NEXT:    v_alignbit_b32 v22, v1, v6, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v56
-; SI-NEXT:    v_alignbit_b32 v20, v6, v7, 16
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v62
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v21
-; SI-NEXT:    v_alignbit_b32 v1, v1, v60, 16
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; kill: killed $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr60
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_alignbit_b32 v17, v6, v7, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v47
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v14, v6, v39, 16
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v42
-; SI-NEXT:    v_alignbit_b32 v10, v6, v44, 16
-; SI-NEXT:    v_mov_b32_e32 v63, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v61
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v27
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v16
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v58
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v49
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v51
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v53
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v12
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v57
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v24
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v56
+; SI-NEXT:    v_alignbit_b32 v22, v1, v7, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; SI-NEXT:    v_alignbit_b32 v20, v7, v9, 16
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v6
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v42
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v30
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v44
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v6, v40, v38, 24
@@ -163335,6 +163221,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v6, v20, v5, 24
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v19
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v6, v20, v5, 16
@@ -163342,33 +163231,49 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v6, v20, v5, 8
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    v_alignbit_b32 v17, v7, v9, 16
+; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    v_alignbit_b32 v6, v17, v4, 24
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v12
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v6, v17, v4, 16
+; SI-NEXT:    v_alignbit_b32 v14, v7, v27, 16
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v6, v17, v4, 8
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v6, v14, v3, 24
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v36
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v6, v14, v3, 16
+; SI-NEXT:    v_alignbit_b32 v10, v7, v39, 16
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v6, v14, v3, 8
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v6, v10, v2, 24
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v30
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v21
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v44
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v6, v10, v2, 16
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v1, v43, 16
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v7, v45, 16
+; SI-NEXT:    v_alignbit_b32 v7, v7, v55, 16
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v6, v10, v2, 8
@@ -163473,135 +163378,173 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr6
 ; SI-NEXT:    ; kill: killed $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 24, v58
 ; SI-NEXT:    ; kill: killed $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 24, v33
 ; SI-NEXT:    ; kill: killed $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 24, v47
 ; SI-NEXT:    ; kill: killed $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 24, v57
 ; SI-NEXT:    ; kill: killed $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 24, v42
 ; SI-NEXT:    ; kill: killed $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 24, v59
 ; SI-NEXT:    ; kill: killed $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 24, v49
 ; SI-NEXT:    ; kill: killed $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 24, v51
 ; SI-NEXT:    ; kill: killed $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 24, v62
 ; SI-NEXT:    ; kill: killed $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 24, v45
 ; SI-NEXT:    ; kill: killed $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 24, v24
 ; SI-NEXT:    ; kill: killed $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v12
 ; SI-NEXT:    ; kill: killed $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 24, v62
-; SI-NEXT:    v_lshrrev_b32_e32 v61, 24, v47
-; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr58
+; SI-NEXT:    ; implicit-def: $vgpr9
 ; SI-NEXT:    ; implicit-def: $vgpr12
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 24, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 24, v36
+; SI-NEXT:    ; implicit-def: $vgpr57
+; SI-NEXT:    ; implicit-def: $vgpr42
+; SI-NEXT:    ; implicit-def: $vgpr62
+; SI-NEXT:    ; implicit-def: $vgpr59
+; SI-NEXT:    ; implicit-def: $vgpr45
 ; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; implicit-def: $vgpr51
-; SI-NEXT:    ; implicit-def: $vgpr57
-; SI-NEXT:    ; implicit-def: $vgpr53
 ; SI-NEXT:    ; implicit-def: $vgpr24
 ; SI-NEXT:    ; kill: killed $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr56
-; SI-NEXT:    ; implicit-def: $vgpr62
-; SI-NEXT:    ; implicit-def: $vgpr47
+; SI-NEXT:    ; kill: killed $vgpr9
+; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $vgpr19
+; SI-NEXT:    ; kill: killed $vgpr12
+; SI-NEXT:    ; implicit-def: $vgpr12
+; SI-NEXT:    ; implicit-def: $vgpr27
+; SI-NEXT:    ; implicit-def: $vgpr63
+; SI-NEXT:    ; implicit-def: $vgpr16
+; SI-NEXT:    ; implicit-def: $vgpr36
 ; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    ; implicit-def: $vgpr42
+; SI-NEXT:    ; implicit-def: $vgpr21
+; SI-NEXT:    ; implicit-def: $vgpr43
 ; SI-NEXT:    ; implicit-def: $vgpr44
-; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; implicit-def: $vgpr55
 ; SI-NEXT:  .LBB90_2: ; %Flow
 ; SI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB90_4
 ; SI-NEXT:  ; %bb.3: ; %cmp.true
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v53
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v46
-; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_alignbit_b32 v8, v11, v8, 16
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v39
-; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v11
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v47
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v49
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v19
-; SI-NEXT:    v_add_f32_e32 v19, 0x40c00000, v11
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v51
-; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT:    v_alignbit_b32 v11, v14, v11, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v19
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v14, v14, v13, 16
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff0000, v58
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v21
-; SI-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v60
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v43
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v63
 ; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v36
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v16
 ; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v41
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v56
 ; SI-NEXT:    v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v43
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v41
 ; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
 ; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v33
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v30
 ; SI-NEXT:    v_alignbit_b32 v3, v4, v3, 16
 ; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v9
 ; SI-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
 ; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v45
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v55
 ; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v5
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v30
-; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v44
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v46
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v5
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v55
-; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v53
+; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
 ; SI-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v7, v6, v7, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT:    v_alignbit_b32 v5, v8, v5, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v9
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v7, v8, v7, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v44
-; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v6
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v42
-; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v6
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v39
+; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v8
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v36
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v49
+; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v8
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v51
+; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT:    v_alignbit_b32 v8, v11, v8, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v16
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v11, v10, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v9
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v10, v63, v10, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v61, 24, v19
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v27
+; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v11
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v12
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v59
+; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v11
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v60
+; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_alignbit_b32 v11, v14, v11, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v14, v14, v13, 16
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff0000, v42
+; SI-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 24, v16
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
@@ -163610,41 +163553,39 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v13
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v62
-; SI-NEXT:    v_add_f32_e32 v21, 0x40c00000, v13
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v59
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v19
+; SI-NEXT:    v_add_f32_e32 v19, 0x40c00000, v13
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v61
 ; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
 ; SI-NEXT:    v_alignbit_b32 v13, v17, v13, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
 ; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v17, v17, v15, 16
 ; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; SI-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT:    s_waitcnt vmcnt(13)
+; SI-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; SI-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; SI-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
 ; SI-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
-; SI-NEXT:    v_alignbit_b32 v23, v26, v23, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; SI-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
+; SI-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; SI-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
+; SI-NEXT:    s_waitcnt vmcnt(10)
+; SI-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; SI-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
+; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
 ; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; SI-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; SI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
-; SI-NEXT:    v_alignbit_b32 v29, v32, v29, 16
 ; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_and_b32_e32 v35, 0xffff0000, v35
 ; SI-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
@@ -163653,6 +163594,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    v_and_b32_e32 v38, 0xffff0000, v38
 ; SI-NEXT:    v_add_f32_e32 v38, 0x40c00000, v38
 ; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v38
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 24, v19
 ; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    v_and_b32_e32 v49, 0xffff0000, v49
 ; SI-NEXT:    s_waitcnt vmcnt(5)
@@ -163671,14 +163613,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v49
 ; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v51
 ; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v53
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; SI-NEXT:    v_add_f32_e32 v18, 0x40c00000, v15
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v56
-; SI-NEXT:    v_add_f32_e32 v30, 0x40c00000, v15
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v16
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
 ; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v55
 ; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v41
 ; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
@@ -163686,80 +163620,91 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT:    v_alignbit_b32 v15, v16, v15, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v30
-; SI-NEXT:    v_alignbit_b32 v20, v16, v18, 16
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff0000, v24
-; SI-NEXT:    v_add_f32_e32 v24, 0x40c00000, v18
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT:    v_add_f32_e32 v18, 0x40c00000, v15
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v57
+; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT:    v_alignbit_b32 v15, v20, v15, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v6
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v20, v20, v18, 16
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v6
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v9
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; SI-NEXT:    v_add_f32_e32 v22, 0x40c00000, v18
+; SI-NEXT:    v_and_b32_e32 v18, 0xffff0000, v24
+; SI-NEXT:    v_add_f32_e32 v21, 0x40c00000, v18
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT:    v_alignbit_b32 v18, v22, v18, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v24
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; SI-NEXT:    v_alignbit_b32 v18, v23, v18, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v21
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v22, v23, v22, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v22, v22, v16, 16
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v21, 24, v21
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; SI-NEXT:    v_add_f32_e32 v25, 0x40c00000, v23
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff0000, v45
+; SI-NEXT:    v_add_f32_e32 v24, 0x40c00000, v23
+; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT:    v_add_f32_e32 v25, 0x40c00000, v16
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v57
-; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v16
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; SI-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
+; SI-NEXT:    v_alignbit_b32 v23, v26, v23, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v24
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; SI-NEXT:    v_alignbit_b32 v25, v26, v25, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v24, 24, v24
+; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; SI-NEXT:    v_add_f32_e32 v28, 0x40c00000, v26
+; SI-NEXT:    v_and_b32_e32 v26, 0xffff0000, v62
 ; SI-NEXT:    v_add_f32_e32 v27, 0x40c00000, v26
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; SI-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT:    v_alignbit_b32 v26, v28, v26, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v12
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v28, v28, v27, 16
-; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v12, 24, v12
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v12, 24, v16
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v12, 24, v24
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v26, v29, v26, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v27
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v28, v29, v28, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v12, 24, v30
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; SI-NEXT:    v_add_f32_e32 v31, 0x40c00000, v27
-; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v27, 24, v27
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; SI-NEXT:    v_add_f32_e32 v31, 0x40c00000, v29
+; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; SI-NEXT:    v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v27
+; SI-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; SI-NEXT:    v_add_f32_e32 v30, 0x40c00000, v29
+; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; SI-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
+; SI-NEXT:    v_alignbit_b32 v29, v32, v29, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; SI-NEXT:    v_alignbit_b32 v31, v32, v31, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 24, v27
-; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 24, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v30, 24, v30
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
 ; SI-NEXT:    v_add_f32_e32 v34, 0x40c00000, v32
@@ -163779,6 +163724,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v33
 ; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v12
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v35, 0xffff0000, v35
 ; SI-NEXT:    v_add_f32_e32 v37, 0x40c00000, v35
@@ -164040,8 +163987,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
 ; SI-NEXT:  .LBB90_4: ; %end
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v6, 0xff, v38
 ; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
 ; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
@@ -164495,7 +164442,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v4, 0xff, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v47
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v5
@@ -164528,7 +164475,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xff, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v61
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v33
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v4
@@ -164559,17 +164506,17 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x70, v0
 ; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v10
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v58
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v63
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_or_b32_e32 v3, v4, v3
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x74, v0
@@ -164599,7 +164546,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -164650,65 +164597,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; kill: killed $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr34
 ; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; kill: killed $vgpr34
 ; VI-NEXT:    ; implicit-def: $vgpr45
 ; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr63
-; VI-NEXT:    ; implicit-def: $vgpr43
+; VI-NEXT:    ; implicit-def: $vgpr38
+; VI-NEXT:    ; implicit-def: $vgpr55
+; VI-NEXT:    ; implicit-def: $vgpr54
 ; VI-NEXT:    ; implicit-def: $vgpr60
 ; VI-NEXT:    ; implicit-def: $vgpr59
 ; VI-NEXT:    ; implicit-def: $vgpr42
@@ -164721,13 +164615,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr46
 ; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr49
-; VI-NEXT:    ; implicit-def: $vgpr38
-; VI-NEXT:    ; implicit-def: $vgpr55
-; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr36
 ; VI-NEXT:    ; implicit-def: $vgpr53
 ; VI-NEXT:    ; implicit-def: $vgpr62
 ; VI-NEXT:    ; implicit-def: $vgpr61
-; VI-NEXT:    ; implicit-def: $vgpr36
+; VI-NEXT:    ; implicit-def: $vgpr63
 ; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    ; implicit-def: $vgpr48
 ; VI-NEXT:    ; implicit-def: $vgpr52
@@ -164830,115 +164722,173 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    ; kill: killed $vgpr33
 ; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB90_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 8, v29
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 24, v28
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v28
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 8, v28
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v27
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 8, v27
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 24, v26
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v26
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 8, v26
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v16
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v25
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 8, v25
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v16
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 24, v24
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v24
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v15
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 8, v24
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v14
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v23
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v14
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 8, v23
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 24, v22
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v13
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v22
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v13
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 8, v22
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v12
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v12
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v6
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v6
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v6
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v5
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v21
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v5
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 8, v21
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v4
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 24, v20
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v20
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v4
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 8, v20
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v3
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v19
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v3
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 8, v19
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v2
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v2
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v1
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v30
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v30
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v29
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v29
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v28
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v28
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v28
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v27
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v27
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 24, v18
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v26
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v26
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v2
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v18
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v2
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 8, v18
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v26
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v25
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v25
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v24
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v24
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v24
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v23
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v23
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v22
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v22
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v22
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v21
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v21
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v20
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v20
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v1
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v17
 ; VI-NEXT:    v_lshrrev_b64 v[44:45], 24, v[15:16]
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v20
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v1
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 8, v17
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[44:45], 24, v[13:14]
@@ -164963,15 +164913,17 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[44:45], 24, v[31:32]
-; VI-NEXT:    v_lshrrev_b64 v[46:47], 24, v[29:30]
-; VI-NEXT:    v_lshrrev_b32_e32 v51, 8, v11
-; VI-NEXT:    v_lshrrev_b32_e32 v50, 8, v31
+; VI-NEXT:    v_lshrrev_b32_e32 v46, 24, v12
 ; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v45, v46
+; VI-NEXT:    v_lshrrev_b64 v[46:47], 24, v[29:30]
+; VI-NEXT:    v_lshrrev_b32_e32 v63, 8, v11
+; VI-NEXT:    v_lshrrev_b32_e32 v50, 8, v31
 ; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v46, v51
-; VI-NEXT:    v_mov_b32_e32 v45, v50
+; VI-NEXT:    v_mov_b32_e32 v46, v63
+; VI-NEXT:    v_mov_b32_e32 v63, v50
 ; VI-NEXT:    v_lshrrev_b64 v[50:51], 24, v[27:28]
 ; VI-NEXT:    v_lshrrev_b32_e32 v56, 8, v10
 ; VI-NEXT:    v_lshrrev_b32_e32 v57, 16, v9
@@ -164993,8 +164945,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[43:44], 24, v[17:18]
 ; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v10
-; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v36, 16, v12
 ; VI-NEXT:    v_lshrrev_b32_e32 v49, 8, v12
 ; VI-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
 ; VI-NEXT:    v_lshrrev_b32_e32 v40, 24, v10
@@ -165003,29 +164954,30 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    v_lshrrev_b32_e32 v35, 8, v8
 ; VI-NEXT:    v_lshrrev_b32_e32 v59, 16, v7
 ; VI-NEXT:    v_lshrrev_b32_e32 v60, 8, v7
-; VI-NEXT:    v_lshrrev_b32_e32 v63, 8, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v54, 24, v6
+; VI-NEXT:    v_lshrrev_b32_e32 v55, 16, v6
+; VI-NEXT:    v_lshrrev_b32_e32 v38, 8, v6
 ; VI-NEXT:    v_lshrrev_b32_e32 v37, 24, v32
 ; VI-NEXT:    v_lshrrev_b32_e32 v52, 16, v32
 ; VI-NEXT:    v_lshrrev_b32_e32 v48, 8, v32
 ; VI-NEXT:    v_lshrrev_b32_e32 v58, 16, v31
-; VI-NEXT:    v_lshrrev_b32_e32 v36, 24, v30
-; VI-NEXT:    v_lshrrev_b32_e32 v61, 16, v19
-; VI-NEXT:    v_lshrrev_b32_e32 v62, 8, v19
-; VI-NEXT:    v_lshrrev_b32_e32 v53, 24, v18
-; VI-NEXT:    v_lshrrev_b32_e32 v54, 16, v18
-; VI-NEXT:    v_lshrrev_b32_e32 v55, 8, v18
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v17
-; VI-NEXT:    v_lshrrev_b32_e32 v38, 8, v17
+; VI-NEXT:    v_lshrrev_b32_e32 v61, 24, v30
+; VI-NEXT:    v_lshrrev_b32_e32 v62, 16, v30
+; VI-NEXT:    v_lshrrev_b32_e32 v53, 8, v30
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v29
 ; VI-NEXT:    v_mov_b32_e32 v47, v34
-; VI-NEXT:    v_lshrrev_b32_e32 v43, 8, v2
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; VI-NEXT:  .LBB90_2: ; %Flow
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    v_mov_b32_e32 v34, v36
+; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
 ; VI-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB90_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
 ; VI-NEXT:    v_lshlrev_b32_e32 v33, 16, v18
 ; VI-NEXT:    v_add_f32_e32 v33, 0x40c00000, v33
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_bfe_u32 v34, v33, 16, 1
 ; VI-NEXT:    v_add_u32_e32 v34, vcc, v34, v33
 ; VI-NEXT:    v_add_u32_e32 v34, vcc, 0x7fff, v34
@@ -165644,112 +165596,109 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[17:18]
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 24, v28
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v28
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 8, v28
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v27
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 8, v27
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 24, v26
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v26
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 8, v26
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v16
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v25
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 8, v25
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v16
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 24, v24
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v24
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v15
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 8, v24
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v14
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v23
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v14
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 8, v23
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 24, v22
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v13
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v22
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v13
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 8, v22
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v12
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v12
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v6
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v6
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v6
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v5
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v21
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v5
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 8, v21
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v4
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 24, v20
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v20
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v4
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 8, v20
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v3
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v19
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v3
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 8, v19
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v2
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v2
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v1
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v30
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v30
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v29
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v29
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v28
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v28
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v28
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v27
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v27
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 24, v18
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v26
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v26
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v26
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v25
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v25
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v24
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v24
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v24
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v23
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v23
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v22
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v22
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v22
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v21
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v21
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v20
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v20
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v2
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v18
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v2
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 8, v18
 ; VI-NEXT:    v_lshrrev_b64 v[56:57], 24, v[25:26]
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v20
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v1
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v17
+; VI-NEXT:    v_lshrrev_b32_e32 v45, 24, v12
+; VI-NEXT:    v_lshrrev_b32_e32 v36, 16, v12
 ; VI-NEXT:    v_lshrrev_b32_e32 v49, 8, v12
 ; VI-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
 ; VI-NEXT:    v_lshrrev_b32_e32 v46, 8, v11
@@ -165763,63 +165712,69 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    v_lshrrev_b32_e32 v35, 8, v8
 ; VI-NEXT:    v_lshrrev_b32_e32 v59, 16, v7
 ; VI-NEXT:    v_lshrrev_b32_e32 v60, 8, v7
-; VI-NEXT:    v_lshrrev_b32_e32 v43, 8, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v63, 8, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v54, 24, v6
+; VI-NEXT:    v_lshrrev_b32_e32 v55, 16, v6
+; VI-NEXT:    v_lshrrev_b32_e32 v38, 8, v6
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    v_lshrrev_b32_e32 v44, 8, v1
 ; VI-NEXT:    v_lshrrev_b32_e32 v37, 24, v32
 ; VI-NEXT:    v_lshrrev_b32_e32 v52, 16, v32
 ; VI-NEXT:    v_lshrrev_b32_e32 v48, 8, v32
 ; VI-NEXT:    v_lshrrev_b32_e32 v58, 16, v31
-; VI-NEXT:    v_lshrrev_b32_e32 v45, 8, v31
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 24, v30
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v61, 16, v19
-; VI-NEXT:    v_lshrrev_b32_e32 v62, 8, v19
-; VI-NEXT:    v_lshrrev_b32_e32 v53, 24, v18
-; VI-NEXT:    v_lshrrev_b32_e32 v54, 16, v18
-; VI-NEXT:    v_lshrrev_b32_e32 v55, 8, v18
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v17
-; VI-NEXT:    v_lshrrev_b32_e32 v38, 8, v17
+; VI-NEXT:    v_lshrrev_b32_e32 v63, 8, v31
+; VI-NEXT:    v_lshrrev_b32_e32 v61, 24, v30
+; VI-NEXT:    v_lshrrev_b32_e32 v62, 16, v30
+; VI-NEXT:    v_lshrrev_b32_e32 v53, 8, v30
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v29
+; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v29
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 8, v17
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
 ; VI-NEXT:  .LBB90_4: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    v_lshlrev_b16_e32 v44, 8, v63
+; VI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b16_e32 v44, 8, v44
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v44, 8, v43
 ; VI-NEXT:    v_or_b32_sdwa v2, v2, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v43, 8, v43
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v43, v36, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v43, v44, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -165832,28 +165787,22 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 16, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v38
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v54
 ; VI-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -165901,14 +165850,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v49
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v45
 ; VI-NEXT:    v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 44, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -165968,34 +165913,48 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v38
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 64, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v55
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v53
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x44, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v62
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x48, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -166006,10 +165965,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x4c, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
@@ -166019,9 +165978,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x50, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -166032,10 +165991,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x54, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
@@ -166045,9 +166004,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x58, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -166058,8 +166017,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x5c, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v56
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
@@ -166069,9 +166028,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x60, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -166082,10 +166041,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x64, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(2)
@@ -166096,9 +166055,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x68, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -166109,34 +166068,26 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x6c, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v34
 ; VI-NEXT:    v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT:    v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x70, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v34
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v53
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v61
 ; VI-NEXT:    v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x74, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v45
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v63
 ; VI-NEXT:    v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
@@ -166189,716 +166140,714 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_mov_b32_e32 v46, v15
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT:    v_mov_b32_e32 v36, v15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    v_mov_b32_e32 v37, v16
-; GFX9-NEXT:    ; kill: killed $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr57
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr59
-; GFX9-NEXT:    ; implicit-def: $vgpr54
-; GFX9-NEXT:    ; implicit-def: $vgpr46
-; GFX9-NEXT:    ; implicit-def: $vgpr42
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr59
+; GFX9-NEXT:    ; implicit-def: $vgpr59
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr59
+; GFX9-NEXT:    ; implicit-def: $vgpr59
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; kill: killed $vgpr59
+; GFX9-NEXT:    ; implicit-def: $vgpr59
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    v_mov_b32_e32 v47, v16
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr36
+; GFX9-NEXT:    ; kill: killed $vgpr59
+; GFX9-NEXT:    ; implicit-def: $vgpr59
+; GFX9-NEXT:    ; kill: killed $vgpr58
+; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; implicit-def: $vgpr41
+; GFX9-NEXT:    ; implicit-def: $vgpr38
+; GFX9-NEXT:    ; implicit-def: $vgpr51
+; GFX9-NEXT:    ; implicit-def: $vgpr53
+; GFX9-NEXT:    ; implicit-def: $vgpr48
+; GFX9-NEXT:    ; implicit-def: $vgpr49
 ; GFX9-NEXT:    ; implicit-def: $vgpr52
+; GFX9-NEXT:    ; implicit-def: $vgpr16
+; GFX9-NEXT:    ; implicit-def: $vgpr42
+; GFX9-NEXT:    ; implicit-def: $vgpr39
+; GFX9-NEXT:    ; implicit-def: $vgpr45
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
-; GFX9-NEXT:    ; implicit-def: $vgpr49
-; GFX9-NEXT:    ; implicit-def: $vgpr55
-; GFX9-NEXT:    ; implicit-def: $vgpr53
-; GFX9-NEXT:    ; implicit-def: $vgpr38
-; GFX9-NEXT:    ; implicit-def: $vgpr35
+; GFX9-NEXT:    ; implicit-def: $vgpr54
+; GFX9-NEXT:    ; implicit-def: $vgpr62
+; GFX9-NEXT:    ; implicit-def: $vgpr61
 ; GFX9-NEXT:    ; implicit-def: $vgpr60
-; GFX9-NEXT:    ; implicit-def: $vgpr45
-; GFX9-NEXT:    ; implicit-def: $vgpr56
+; GFX9-NEXT:    ; implicit-def: $vgpr35
+; GFX9-NEXT:    ; implicit-def: $vgpr44
+; GFX9-NEXT:    ; implicit-def: $vgpr34
 ; GFX9-NEXT:    ; implicit-def: $vgpr43
+; GFX9-NEXT:    ; kill: killed $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr33
+; GFX9-NEXT:    ; implicit-def: $vgpr55
+; GFX9-NEXT:    ; implicit-def: $vgpr56
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
-; GFX9-NEXT:    ; implicit-def: $vgpr51
-; GFX9-NEXT:    ; implicit-def: $vgpr62
+; GFX9-NEXT:    ; kill: killed $vgpr36
+; GFX9-NEXT:    ; implicit-def: $vgpr37
+; GFX9-NEXT:    ; implicit-def: $vgpr57
+; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    ; implicit-def: $vgpr63
+; GFX9-NEXT:    ; kill: killed $vgpr59
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; GFX9-NEXT:    ; implicit-def: $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr47
-; GFX9-NEXT:    ; implicit-def: $vgpr48
-; GFX9-NEXT:    ; implicit-def: $vgpr39
-; GFX9-NEXT:    ; implicit-def: $vgpr44
-; GFX9-NEXT:    ; implicit-def: $vgpr61
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr16
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr16
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr16
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr16
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr16
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr16
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr16
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr16
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr16
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr16
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr16
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr16
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr16
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT:    ; implicit-def: $vgpr58
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(31)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(33)
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
+; GFX9-NEXT:    ; implicit-def: $vgpr15
+; GFX9-NEXT:    ; kill: killed $vgpr15
+; GFX9-NEXT:    ; implicit-def: $vgpr15
+; GFX9-NEXT:    ; kill: killed $vgpr15
+; GFX9-NEXT:    ; implicit-def: $vgpr15
+; GFX9-NEXT:    ; kill: killed $vgpr15
+; GFX9-NEXT:    ; implicit-def: $vgpr15
+; GFX9-NEXT:    ; kill: killed $vgpr15
+; GFX9-NEXT:    ; implicit-def: $vgpr15
+; GFX9-NEXT:    ; kill: killed $vgpr15
+; GFX9-NEXT:    ; implicit-def: $vgpr15
+; GFX9-NEXT:    ; kill: killed $vgpr15
+; GFX9-NEXT:    ; implicit-def: $vgpr15
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB90_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v37
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v37
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v36
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v14
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 24, v4
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v2
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 24, v32
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v32
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 8, v32
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v47
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(38)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v31
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v47
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 8, v31
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v47
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 24, v30
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v46
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v30
 ; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v46
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 8, v30
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v14
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v29
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 8, v29
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v14
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 24, v28
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v13
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v13
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 8, v28
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v12
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v27
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v12
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 8, v27
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v12
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 24, v26
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v26
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v10
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 8, v26
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v10
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v25
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v9
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v9
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 8, v25
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v8
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 24, v6
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 24, v24
+; GFX9-NEXT:    v_lshrrev_b32_e32 v58, 16, v18
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v8
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v8
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v7
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v6
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v6
+; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v24
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[58:59], 24, v[46:47]
 ; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v4
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v2
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v1
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v32
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v32
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(59)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v31
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v31
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v30
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v30
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v30
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v29
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v28
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v28
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v28
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v27
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v26
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v26
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v26
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v25
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v25
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v24
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v24
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v23
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v22
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v22
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v21
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v21
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v20
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v18
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b64 v[15:16], 24, v[36:37]
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b64 v[15:16], 24, v[13:14]
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b64 v[15:16], 24, v[11:12]
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b64 v[15:16], 24, v[9:10]
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b64 v[15:16], 24, v[7:8]
-; GFX9-NEXT:    v_lshrrev_b64 v[33:34], 24, v[3:4]
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b64 v[33:34], 24, v[1:2]
-; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
-; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b64 v[33:34], 24, v[29:30]
-; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b64 v[33:34], 24, v[27:28]
-; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b64 v[33:34], 24, v[25:26]
-; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b64 v[33:34], 24, v[23:24]
-; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b64 v[33:34], 24, v[21:22]
-; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b64 v[33:34], 24, v[19:20]
-; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b64 v[33:34], 24, v[17:18]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 8, v36
-; GFX9-NEXT:    v_lshrrev_b32_e32 v38, 24, v12
-; GFX9-NEXT:    v_lshrrev_b32_e32 v53, 8, v11
-; GFX9-NEXT:    v_lshrrev_b32_e32 v55, 8, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v49, 16, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v40, 8, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v52, 16, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v41, 8, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v42, 8, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v46, 8, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v54, 24, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v59, 8, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v57, 8, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v61, 24, v32
-; GFX9-NEXT:    v_lshrrev_b32_e32 v44, 16, v29
-; GFX9-NEXT:    v_lshrrev_b32_e32 v39, 16, v27
-; GFX9-NEXT:    v_lshrrev_b32_e32 v48, 8, v24
-; GFX9-NEXT:    v_lshrrev_b32_e32 v47, 16, v23
-; GFX9-NEXT:    v_lshrrev_b32_e32 v58, 24, v22
-; GFX9-NEXT:    v_lshrrev_b32_e32 v63, 24, v20
-; GFX9-NEXT:    v_lshrrev_b32_e32 v62, 8, v20
-; GFX9-NEXT:    v_lshrrev_b32_e32 v51, 16, v19
-; GFX9-NEXT:    v_lshrrev_b32_e32 v50, 8, v19
-; GFX9-NEXT:    v_lshrrev_b32_e32 v43, 24, v18
-; GFX9-NEXT:    v_lshrrev_b32_e32 v56, 8, v18
-; GFX9-NEXT:    v_lshrrev_b32_e32 v45, 16, v17
-; GFX9-NEXT:    v_lshrrev_b32_e32 v60, 8, v17
-; GFX9-NEXT:    v_lshrrev_b64 v[15:16], 24, v[5:6]
-; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v22
+; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v20
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[58:59], 24, v[13:14]
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[58:59], 24, v[11:12]
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[58:59], 24, v[9:10]
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[58:59], 24, v[7:8]
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[58:59], 24, v[5:6]
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[58:59], 24, v[3:4]
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[58:59], 24, v[1:2]
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[58:59], 24, v[31:32]
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[58:59], 24, v[29:30]
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[58:59], 24, v[27:28]
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[58:59], 24, v[25:26]
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[58:59], 24, v[23:24]
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[58:59], 24, v[21:22]
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[58:59], 24, v[19:20]
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[58:59], 24, v[17:18]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v54, 8, v11
+; GFX9-NEXT:    v_lshrrev_b32_e32 v40, 8, v10
+; GFX9-NEXT:    v_lshrrev_b32_e32 v45, 8, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v39, 8, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v42, 8, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v52, 8, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v49, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v48, 8, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v53, 24, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v51, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v38, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v41, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v63, 8, v24
+; GFX9-NEXT:    v_lshrrev_b32_e32 v36, 16, v23
+; GFX9-NEXT:    v_lshrrev_b32_e32 v57, 8, v23
+; GFX9-NEXT:    v_lshrrev_b32_e32 v37, 24, v22
+; GFX9-NEXT:    v_lshrrev_b32_e32 v50, 8, v22
+; GFX9-NEXT:    v_lshrrev_b32_e32 v56, 16, v21
+; GFX9-NEXT:    v_lshrrev_b32_e32 v55, 8, v21
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 24, v20
+; GFX9-NEXT:    v_lshrrev_b32_e32 v43, 8, v20
+; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v19
+; GFX9-NEXT:    v_lshrrev_b32_e32 v44, 8, v19
+; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 24, v18
+; GFX9-NEXT:    v_lshrrev_b32_e32 v60, 8, v18
+; GFX9-NEXT:    v_lshrrev_b32_e32 v61, 16, v17
+; GFX9-NEXT:    v_lshrrev_b32_e32 v62, 8, v17
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; GFX9-NEXT:  .LBB90_2: ; %Flow
-; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v58, v57
+; GFX9-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB90_4
 ; GFX9-NEXT:  ; %bb.3: ; %cmp.true
-; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v18
-; GFX9-NEXT:    v_add_f32_e32 v33, 0x40c00000, v33
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; GFX9-NEXT:    v_bfe_u32 v34, v33, 16, 1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v18
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX9-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; GFX9-NEXT:    v_add3_u32 v34, v34, v33, s6
-; GFX9-NEXT:    v_or_b32_e32 v35, 0x400000, v33
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v33, v33
-; GFX9-NEXT:    v_bfe_u32 v33, v18, 16, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v39, v34, v35, vcc
-; GFX9-NEXT:    v_add3_u32 v33, v33, v18, s6
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v18
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
-; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
-; GFX9-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v62, v33, v34, vcc
-; GFX9-NEXT:    v_bfe_u32 v33, v18, 16, 1
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_add3_u32 v33, v33, v18, s6
-; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v18
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v33, v34, vcc
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v18
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v33, v16, v33, vcc
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v16, v18, vcc
 ; GFX9-NEXT:    s_mov_b32 s7, 0x7060302
-; GFX9-NEXT:    v_cndmask_b32_e32 v17, v18, v33, vcc
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_perm_b32 v15, v17, v15, s7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v33, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v63, v18, v33, vcc
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_perm_b32 v16, v62, v39, s7
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v20, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v19
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v18, v20, vcc
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v20, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v19
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_perm_b32 v16, v15, v63, s7
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v18, v20, vcc
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v17, v18, v19, vcc
 ; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_perm_b32 v15, v17, v15, s7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v22
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v22
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v18, v19, vcc
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v21
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v16, v18, v19, vcc
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v21
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v43, v18, v19, vcc
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v17, v18, v19, vcc
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_perm_b32 v16, v16, v15, s7
-; GFX9-NEXT:    v_perm_b32 v15, v17, v43, s7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v24
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v24
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v38, v18, v19, vcc
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v23
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v24, v18, v19, vcc
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v23
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v22, v18, v19, vcc
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_cndmask_b32_e32 v17, v18, v19, vcc
-; GFX9-NEXT:    v_perm_b32 v15, v17, v22, s7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v26
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v26
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v23, v18, v19, vcc
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v25
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v26, v18, v19, vcc
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v25
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v50, v18, v19, vcc
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_perm_b32 v16, v24, v38, s7
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_cndmask_b32_e32 v17, v18, v19, vcc
-; GFX9-NEXT:    v_perm_b32 v15, v17, v50, s7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v28
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v28
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v25, v18, v19, vcc
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v27
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v28, v18, v19, vcc
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v27
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v51, v18, v19, vcc
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_perm_b32 v16, v26, v23, s7
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_perm_b32 v34, v15, v33, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v17
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v17
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v16, v18, vcc
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_perm_b32 v33, v15, v18, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v20
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v20
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v16, v17, vcc
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_cndmask_b32_e32 v17, v18, v19, vcc
-; GFX9-NEXT:    v_perm_b32 v15, v17, v51, s7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v30
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v30
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v27, v18, v19, vcc
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v29
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v30, v18, v19, vcc
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v29
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v48, v18, v19, vcc
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_perm_b32 v16, v28, v25, s7
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_perm_b32 v18, v15, v18, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v19
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v19
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v20, v16, v17, vcc
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX9-NEXT:    v_perm_b32 v17, v15, v20, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v22
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_cndmask_b32_e32 v17, v18, v19, vcc
-; GFX9-NEXT:    v_perm_b32 v15, v17, v48, s7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v32
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v32
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v29, v18, v19, vcc
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    s_waitcnt vmcnt(47)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v31
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v32, v18, v19, vcc
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v31
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v49, v18, v19, vcc
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX9-NEXT:    v_perm_b32 v16, v30, v27, s7
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v22
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v16, v17, vcc
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX9-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_perm_b32 v18, v15, v18, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v21
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v21
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v61, v16, v17, vcc
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX9-NEXT:    v_perm_b32 v17, v15, v61, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_cndmask_b32_e32 v17, v18, v19, vcc
-; GFX9-NEXT:    v_perm_b32 v15, v17, v49, s7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
-; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v24
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v16, v17, vcc
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_perm_b32 v59, v15, v18, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v23
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v23
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v16, v17, vcc
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX9-NEXT:    v_perm_b32 v58, v15, v18, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v26
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v26
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v16, v17, vcc
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_perm_b32 v63, v15, v18, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v25
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v25
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v16, v17, vcc
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX9-NEXT:    v_perm_b32 v62, v15, v18, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v28
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v28
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v60, v16, v17, vcc
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v27
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v28, v16, v17, vcc
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v27
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v26, v16, v17, vcc
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX9-NEXT:    v_perm_b32 v56, v15, v26, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v30
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v30
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v27, v16, v17, vcc
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v29
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v30, v16, v17, vcc
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v29
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v25, v16, v17, vcc
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX9-NEXT:    v_perm_b32 v33, v15, v25, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v32
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v32
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v29, v16, v17, vcc
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    s_waitcnt vmcnt(51)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v31
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v16, v17, vcc
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v31
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v24, v16, v17, vcc
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX9-NEXT:    v_perm_b32 v35, v15, v24, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
 ; GFX9-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX9-NEXT:    v_add3_u32 v18, v18, v17, s6
-; GFX9-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT:    v_bfe_u32 v17, v2, 16, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v31, v18, v19, vcc
-; GFX9-NEXT:    v_add3_u32 v17, v17, v2, s6
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v2
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_bfe_u32 v15, v2, 16, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v31, v16, v17, vcc
+; GFX9-NEXT:    v_add3_u32 v15, v15, v2, s6
+; GFX9-NEXT:    v_or_b32_e32 v16, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX9-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v21, v17, v18, vcc
-; GFX9-NEXT:    v_bfe_u32 v17, v2, 16, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v22, v15, v16, vcc
+; GFX9-NEXT:    v_bfe_u32 v15, v2, 16, 1
 ; GFX9-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX9-NEXT:    v_add3_u32 v17, v17, v2, s6
-; GFX9-NEXT:    v_or_b32_e32 v18, 0x400000, v2
+; GFX9-NEXT:    v_add3_u32 v15, v15, v2, s6
+; GFX9-NEXT:    v_or_b32_e32 v16, 0x400000, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v52, v17, v18, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v23, v15, v16, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v1, s6
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v1
+; GFX9-NEXT:    v_or_b32_e32 v15, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v17, vcc
-; GFX9-NEXT:    v_perm_b32 v60, v1, v52, s7
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v15, vcc
+; GFX9-NEXT:    v_perm_b32 v37, v1, v23, s7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
 ; GFX9-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v1, s6
-; GFX9-NEXT:    v_or_b32_e32 v17, 0x400000, v1
+; GFX9-NEXT:    v_or_b32_e32 v15, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
 ; GFX9-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v20, v2, v17, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v21, v2, v15, vcc
 ; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v1, s6
 ; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
@@ -166912,13 +166861,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
 ; GFX9-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v53, v2, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v20, v2, v4, vcc
 ; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v1, s6
 ; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT:    v_perm_b32 v54, v1, v53, s7
+; GFX9-NEXT:    v_perm_b32 v48, v1, v20, s7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
 ; GFX9-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
@@ -166927,6 +166876,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
 ; GFX9-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX9-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_cndmask_b32_e32 v18, v2, v3, vcc
 ; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v1, s6
@@ -166947,7 +166897,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT:    v_perm_b32 v33, v1, v17, s7
+; GFX9-NEXT:    v_perm_b32 v50, v1, v17, s7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v8
 ; GFX9-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
@@ -166976,7 +166926,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_or_b32_e32 v7, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v7, vcc
-; GFX9-NEXT:    v_perm_b32 v56, v1, v4, s7
+; GFX9-NEXT:    v_perm_b32 v52, v1, v4, s7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v10
 ; GFX9-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
@@ -166995,515 +166945,480 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v2, v10, vcc
 ; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v1, s6
-; GFX9-NEXT:    v_or_b32_e32 v42, 0x400000, v1
+; GFX9-NEXT:    v_or_b32_e32 v15, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v9
 ; GFX9-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; GFX9-NEXT:    v_bfe_u32 v9, v1, 16, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v2, v42, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v2, v15, vcc
 ; GFX9-NEXT:    v_add3_u32 v9, v9, v1, s6
-; GFX9-NEXT:    v_or_b32_e32 v42, 0x400000, v1
+; GFX9-NEXT:    v_or_b32_e32 v15, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v42, vcc
-; GFX9-NEXT:    v_perm_b32 v40, v1, v3, s7
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v15, vcc
+; GFX9-NEXT:    v_perm_b32 v39, v1, v3, s7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v12
 ; GFX9-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; GFX9-NEXT:    v_bfe_u32 v9, v1, 16, 1
 ; GFX9-NEXT:    v_add3_u32 v9, v9, v1, s6
-; GFX9-NEXT:    v_or_b32_e32 v44, 0x400000, v1
+; GFX9-NEXT:    v_or_b32_e32 v15, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v12
 ; GFX9-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; GFX9-NEXT:    v_bfe_u32 v12, v1, 16, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v9, v44, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v9, v15, vcc
 ; GFX9-NEXT:    v_add3_u32 v12, v12, v1, s6
-; GFX9-NEXT:    v_or_b32_e32 v44, 0x400000, v1
+; GFX9-NEXT:    v_or_b32_e32 v15, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v11
 ; GFX9-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v12, v44, vcc
-; GFX9-NEXT:    v_bfe_u32 v44, v1, 16, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v12, v15, vcc
+; GFX9-NEXT:    v_bfe_u32 v15, v1, 16, 1
 ; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT:    v_add3_u32 v44, v44, v1, s6
-; GFX9-NEXT:    v_or_b32_e32 v46, 0x400000, v1
+; GFX9-NEXT:    v_add3_u32 v15, v15, v1, s6
+; GFX9-NEXT:    v_or_b32_e32 v16, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v44, v46, vcc
-; GFX9-NEXT:    v_bfe_u32 v44, v11, 16, 1
-; GFX9-NEXT:    v_add3_u32 v44, v44, v11, s6
-; GFX9-NEXT:    v_or_b32_e32 v46, 0x400000, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v15, v16, vcc
+; GFX9-NEXT:    v_bfe_u32 v15, v11, 16, 1
+; GFX9-NEXT:    v_add3_u32 v15, v15, v11, s6
+; GFX9-NEXT:    v_or_b32_e32 v16, 0x400000, v11
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, v44, v46, vcc
-; GFX9-NEXT:    v_perm_b32 v44, v11, v2, s7
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v15, v16, vcc
+; GFX9-NEXT:    v_perm_b32 v54, v11, v2, s7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v14
 ; GFX9-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; GFX9-NEXT:    v_bfe_u32 v46, v11, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v15, v11, 16, 1
 ; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT:    v_add3_u32 v46, v46, v11, s6
-; GFX9-NEXT:    v_or_b32_e32 v47, 0x400000, v11
+; GFX9-NEXT:    v_add3_u32 v15, v15, v11, s6
+; GFX9-NEXT:    v_or_b32_e32 v16, 0x400000, v11
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX9-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, v46, v47, vcc
-; GFX9-NEXT:    v_bfe_u32 v46, v14, 16, 1
-; GFX9-NEXT:    v_add3_u32 v46, v46, v14, s6
-; GFX9-NEXT:    v_or_b32_e32 v47, 0x400000, v14
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v15, v16, vcc
+; GFX9-NEXT:    v_bfe_u32 v15, v14, 16, 1
+; GFX9-NEXT:    v_add3_u32 v15, v15, v14, s6
+; GFX9-NEXT:    v_or_b32_e32 v16, 0x400000, v14
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
-; GFX9-NEXT:    v_cndmask_b32_e32 v14, v46, v47, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v46, 16, v13
-; GFX9-NEXT:    v_add_f32_e32 v46, 0x40c00000, v46
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v15, v16, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v13
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
 ; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT:    v_bfe_u32 v58, v46, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
 ; GFX9-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; GFX9-NEXT:    v_add3_u32 v58, v58, v46, s6
-; GFX9-NEXT:    v_or_b32_e32 v59, 0x400000, v46
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v46, v46
-; GFX9-NEXT:    v_bfe_u32 v46, v13, 16, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v58, v59, vcc
-; GFX9-NEXT:    v_add3_u32 v46, v46, v13, s6
-; GFX9-NEXT:    v_or_b32_e32 v58, 0x400000, v13
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v41, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_bfe_u32 v15, v13, 16, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v16, v41, vcc
+; GFX9-NEXT:    v_add3_u32 v15, v15, v13, s6
+; GFX9-NEXT:    v_or_b32_e32 v16, 0x400000, v13
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v46, v58, vcc
-; GFX9-NEXT:    v_perm_b32 v16, v32, v29, s7
-; GFX9-NEXT:    v_perm_b32 v46, v13, v1, s7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v37
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v15, v16, vcc
+; GFX9-NEXT:    v_perm_b32 v41, v13, v1, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v47
 ; GFX9-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; GFX9-NEXT:    v_bfe_u32 v58, v13, 16, 1
-; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v37
-; GFX9-NEXT:    v_add3_u32 v58, v58, v13, s6
-; GFX9-NEXT:    v_or_b32_e32 v59, 0x400000, v13
+; GFX9-NEXT:    v_bfe_u32 v15, v13, 16, 1
+; GFX9-NEXT:    v_add3_u32 v15, v15, v13, s6
+; GFX9-NEXT:    v_or_b32_e32 v16, 0x400000, v13
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v58, v59, vcc
-; GFX9-NEXT:    v_bfe_u32 v58, v16, 16, 1
-; GFX9-NEXT:    v_add3_u32 v58, v58, v16, s6
-; GFX9-NEXT:    v_or_b32_e32 v59, 0x400000, v16
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v16, v58, v59, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v58, 16, v36
-; GFX9-NEXT:    v_add_f32_e32 v58, 0x40c00000, v58
-; GFX9-NEXT:    v_bfe_u32 v35, v58, 16, 1
-; GFX9-NEXT:    v_add3_u32 v35, v35, v58, s6
-; GFX9-NEXT:    v_or_b32_e32 v15, 0x400000, v58
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v58, v58
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v35, v15, vcc
-; GFX9-NEXT:    v_and_b32_e32 v35, 0xffff0000, v36
-; GFX9-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
-; GFX9-NEXT:    v_bfe_u32 v36, v35, 16, 1
-; GFX9-NEXT:    v_add3_u32 v36, v36, v35, s6
-; GFX9-NEXT:    v_or_b32_e32 v37, 0x400000, v35
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v35, v35
-; GFX9-NEXT:    v_cndmask_b32_e32 v35, v36, v37, vcc
-; GFX9-NEXT:    v_perm_b32 v58, v35, v15, s7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v36, 16, v15
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v32
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v30
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v28
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v26
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v24
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 16, v23
-; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 16, v22
-; GFX9-NEXT:    v_perm_b32 v59, v16, v13, s7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v37, 16, v13
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v15, v16, vcc
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v47
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v44, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v44, v16, v44, vcc
+; GFX9-NEXT:    v_perm_b32 v16, v44, v13, s7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v47, 16, v13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v14
-; GFX9-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v12
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v10
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v10
+; GFX9-NEXT:    v_perm_b32 v53, v8, v5, s7
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; GFX9-NEXT:    v_perm_b32 v55, v19, v20, s7
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v19
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GFX9-NEXT:    v_perm_b32 v57, v8, v5, s7
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v17
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v21
-; GFX9-NEXT:    v_perm_b32 v47, v14, v11, s7
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_perm_b32 v45, v12, v9, s7
-; GFX9-NEXT:    v_perm_b32 v41, v10, v7, s7
-; GFX9-NEXT:    v_perm_b32 v34, v6, v18, s7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v18
-; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 16, v38
-; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 16, v39
-; GFX9-NEXT:    v_mov_b32_e32 v38, v54
-; GFX9-NEXT:    v_perm_b32 v61, v21, v31, s7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 16, v43
-; GFX9-NEXT:    v_mov_b32_e32 v43, v34
-; GFX9-NEXT:    v_mov_b32_e32 v39, v55
-; GFX9-NEXT:    v_mov_b32_e32 v42, v33
-; GFX9-NEXT:    v_lshrrev_b64 v[33:34], 24, v[38:39]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v32
+; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v30
+; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v28
+; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_perm_b32 v34, v30, v27, s7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v30, 16, v27
+; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v26
+; GFX9-NEXT:    v_perm_b32 v36, v32, v29, s7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 16, v29
+; GFX9-NEXT:    v_lshrrev_b32_e32 v29, 16, v25
+; GFX9-NEXT:    v_perm_b32 v38, v22, v31, s7
+; GFX9-NEXT:    v_perm_b32 v42, v14, v11, s7
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v11
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v31
-; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 16, v29
-; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 16, v49
-; GFX9-NEXT:    v_lshrrev_b32_e32 v29, 16, v48
-; GFX9-NEXT:    v_mov_b32_e32 v48, v60
-; GFX9-NEXT:    v_mov_b32_e32 v49, v61
-; GFX9-NEXT:    v_lshrrev_b32_e32 v30, 16, v27
-; GFX9-NEXT:    v_lshrrev_b32_e32 v28, 16, v25
-; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v51
-; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v50
+; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 16, v24
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v19
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v22
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v23
+; GFX9-NEXT:    v_perm_b32 v55, v12, v9, s7
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v9
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v53
-; GFX9-NEXT:    v_lshrrev_b32_e32 v53, 8, v44
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v52
-; GFX9-NEXT:    v_lshrrev_b32_e32 v55, 8, v41
-; GFX9-NEXT:    v_lshrrev_b32_e32 v52, 16, v42
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v20
+; GFX9-NEXT:    v_perm_b32 v49, v19, v21, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v46
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_bfe_u32 v45, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v45, v45, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v43, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v46
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v43, v45, v43, vcc
+; GFX9-NEXT:    v_bfe_u32 v45, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v45, v45, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v46, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v45, v46, vcc
+; GFX9-NEXT:    v_perm_b32 v15, v15, v43, s7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v44, 16, v44
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v46, 16, v43
+; GFX9-NEXT:    v_lshrrev_b64 v[43:44], 24, v[15:16]
+; GFX9-NEXT:    v_perm_b32 v51, v6, v18, s7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v18
+; GFX9-NEXT:    v_perm_b32 v40, v10, v7, s7
+; GFX9-NEXT:    v_perm_b32 v57, v28, v60, s7
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v20
-; GFX9-NEXT:    v_lshrrev_b32_e32 v20, 16, v63
-; GFX9-NEXT:    s_waitcnt vmcnt(8)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v21
+; GFX9-NEXT:    v_lshrrev_b32_e32 v28, 16, v60
+; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 16, v61
+; GFX9-NEXT:    v_lshrrev_b32_e32 v45, 8, v39
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v15
-; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 16, v17
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v17
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v15
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v62
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
-; GFX9-NEXT:    v_lshrrev_b64 v[15:16], 24, v[58:59]
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b64 v[15:16], 24, v[46:47]
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b64 v[15:16], 24, v[44:45]
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b64 v[15:16], 24, v[40:41]
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 16, v17
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 16, v17
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v17
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v20, 16, v17
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v17
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 16, v17
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b64 v[15:16], 24, v[56:57]
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[43:44], 24, v[41:42]
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b64 v[33:34], 24, v[48:49]
-; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[43:44], 24, v[54:55]
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_lshrrev_b64 v[15:16], 24, v[42:43]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 24, v59
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b64 v[33:34], 24, v[61:62]
-; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[43:44], 24, v[39:40]
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[43:44], 24, v[52:53]
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 8, v62
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b64 v[33:34], 24, v[33:34]
-; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[43:44], 24, v[50:51]
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[43:44], 24, v[48:49]
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b64 v[33:34], 24, v[33:34]
-; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[43:44], 24, v[37:38]
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[43:44], 24, v[35:36]
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b64 v[33:34], 24, v[33:34]
-; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[43:44], 24, v[33:34]
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[43:44], 24, v[56:57]
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b64 v[33:34], 24, v[33:34]
-; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[43:44], 24, v[62:63]
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[43:44], 24, v[58:59]
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_waitcnt vmcnt(28)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b64 v[33:34], 24, v[33:34]
-; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[43:44], 24, v[60:61]
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b64 v[50:51], 24, v[33:34]
-; GFX9-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[43:44], 24, v[43:44]
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b64 v[50:51], 24, v[50:51]
-; GFX9-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b64 v[43:44], 24, v[43:44]
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v59
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v58
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 24, v47
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v47
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v46
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v46
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v45
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v44
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 24, v41
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v40
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v40
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 24, v57
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v43, 24, v16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v16
 ; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v57
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v56
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 24, v43
-; GFX9-NEXT:    v_lshrrev_b32_e32 v40, 8, v43
-; GFX9-NEXT:    v_mov_b32_e32 v43, v38
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 24, v39
-; GFX9-NEXT:    v_lshrrev_b32_e32 v46, 8, v43
-; GFX9-NEXT:    v_mov_b32_e32 v43, v48
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v38
-; GFX9-NEXT:    v_lshrrev_b32_e32 v57, 8, v43
-; GFX9-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 16, v61
-; GFX9-NEXT:    v_mov_b32_e32 v43, v61
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v48
-; GFX9-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 8, v43
-; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_lshrrev_b32_e32 v50, 8, v58
-; GFX9-NEXT:    v_lshrrev_b32_e32 v58, 24, v45
-; GFX9-NEXT:    v_lshrrev_b32_e32 v41, 8, v42
-; GFX9-NEXT:    v_lshrrev_b32_e32 v42, 8, v39
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 24, v62
-; GFX9-NEXT:    v_lshrrev_b32_e32 v47, 16, v56
-; GFX9-NEXT:    v_lshrrev_b32_e32 v56, 24, v49
-; GFX9-NEXT:    v_lshrrev_b32_e32 v59, 8, v49
-; GFX9-NEXT:    v_mov_b32_e32 v54, v56
-; GFX9-NEXT:    v_mov_b32_e32 v38, v58
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v51, 8, v43
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 24, v44
-; GFX9-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 8, v44
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v15
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v15
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v42
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v42
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v41
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v41
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v55
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v55
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v54
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v40
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v39
+; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v53
+; GFX9-NEXT:    v_lshrrev_b32_e32 v39, 8, v53
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 24, v51
+; GFX9-NEXT:    v_lshrrev_b32_e32 v42, 8, v51
+; GFX9-NEXT:    v_lshrrev_b32_e32 v53, 24, v38
+; GFX9-NEXT:    v_lshrrev_b32_e32 v51, 8, v38
+; GFX9-NEXT:    v_lshrrev_b32_e32 v38, 16, v37
+; GFX9-NEXT:    v_lshrrev_b32_e32 v41, 8, v37
+; GFX9-NEXT:    v_lshrrev_b32_e32 v37, 24, v36
+; GFX9-NEXT:    v_lshrrev_b32_e32 v36, 8, v36
+; GFX9-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v36, 16, v35
+; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 8, v35
+; GFX9-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 24, v34
+; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 8, v34
+; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v33
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 8, v33
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 24, v57
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 8, v57
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v56
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 8, v56
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 24, v63
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 8, v63
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v62
+; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v50
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 8, v62
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v52
+; GFX9-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v50
+; GFX9-NEXT:    v_lshrrev_b32_e32 v50, 24, v49
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 24, v59
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 16, v43
-; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v51, 24, v44
-; GFX9-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v51, 8, v44
-; GFX9-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v51, 16, v43
-; GFX9-NEXT:    v_lshrrev_b32_e32 v43, 8, v43
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v43, 24, v45
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v43, 8, v45
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v43, 16, v44
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v43, 8, v44
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v39, 8, v43
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v45, 24, v44
-; GFX9-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_lshrrev_b32_e32 v45, 16, v43
-; GFX9-NEXT:    v_lshrrev_b32_e32 v48, 8, v44
-; GFX9-NEXT:    v_mov_b32_e32 v44, v35
-; GFX9-NEXT:    v_mov_b32_e32 v35, v50
-; GFX9-NEXT:    v_mov_b32_e32 v39, v51
-; GFX9-NEXT:    v_lshrrev_b32_e32 v51, 16, v33
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v49, 8, v63
-; GFX9-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v49, 16, v62
-; GFX9-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v49, 8, v62
-; GFX9-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_mov_b32_e32 v49, v33
-; GFX9-NEXT:    v_lshrrev_b32_e32 v58, 24, v63
-; GFX9-NEXT:    v_lshrrev_b32_e32 v63, 24, v34
-; GFX9-NEXT:    v_lshrrev_b32_e32 v62, 8, v34
-; GFX9-NEXT:    v_lshrrev_b32_e32 v50, 8, v49
-; GFX9-NEXT:    v_mov_b32_e32 v49, v47
-; GFX9-NEXT:    v_mov_b32_e32 v47, v45
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v45, 16, v60
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v43, 24, v61
-; GFX9-NEXT:    v_lshrrev_b32_e32 v56, 8, v61
-; GFX9-NEXT:    v_mov_b32_e32 v61, v16
+; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v63, 8, v59
+; GFX9-NEXT:    v_lshrrev_b32_e32 v37, 24, v61
+; GFX9-NEXT:    v_lshrrev_b32_e32 v50, 8, v61
+; GFX9-NEXT:    v_lshrrev_b32_e32 v56, 16, v60
+; GFX9-NEXT:    v_mov_b32_e32 v33, v60
+; GFX9-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_lshrrev_b32_e32 v54, 8, v54
+; GFX9-NEXT:    v_lshrrev_b32_e32 v40, 8, v40
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v52
+; GFX9-NEXT:    v_lshrrev_b32_e32 v52, 8, v49
+; GFX9-NEXT:    v_lshrrev_b32_e32 v49, 16, v48
+; GFX9-NEXT:    v_lshrrev_b32_e32 v48, 8, v48
+; GFX9-NEXT:    v_lshrrev_b32_e32 v36, 16, v58
+; GFX9-NEXT:    v_lshrrev_b32_e32 v58, 8, v58
+; GFX9-NEXT:    v_lshrrev_b32_e32 v55, 8, v33
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v61
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 24, v62
+; GFX9-NEXT:    v_lshrrev_b32_e32 v43, 8, v62
+; GFX9-NEXT:    v_lshrrev_b32_e32 v44, 8, v61
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 24, v60
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v60, 8, v60
+; GFX9-NEXT:    v_lshrrev_b32_e32 v61, 16, v59
+; GFX9-NEXT:    v_lshrrev_b32_e32 v62, 8, v59
 ; GFX9-NEXT:  .LBB90_4: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 8, v57
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 8, v59
-; GFX9-NEXT:    v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 8, v46
-; GFX9-NEXT:    v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 8, v42
-; GFX9-NEXT:    v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 8, v41
+; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 8, v15
+; GFX9-NEXT:    v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 8, v39
+; GFX9-NEXT:    v_or_b32_sdwa v8, v8, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 8, v45
+; GFX9-NEXT:    v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 8, v40
+; GFX9-NEXT:    v_or_b32_sdwa v10, v10, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 8, v54
+; GFX9-NEXT:    v_or_b32_sdwa v11, v11, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 8, v16
+; GFX9-NEXT:    v_lshlrev_b16_e32 v51, 8, v51
+; GFX9-NEXT:    v_lshlrev_b16_e32 v48, 8, v48
 ; GFX9-NEXT:    v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 8, v40
+; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 8, v42
+; GFX9-NEXT:    v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b16_e32 v48, 8, v52
 ; GFX9-NEXT:    v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 8, v16
-; GFX9-NEXT:    v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 8, v16
-; GFX9-NEXT:    v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 8, v16
-; GFX9-NEXT:    v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 8, v55
-; GFX9-NEXT:    v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 8, v53
-; GFX9-NEXT:    v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 8, v16
-; GFX9-NEXT:    v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v39, 8, v62
+; GFX9-NEXT:    v_or_b32_sdwa v17, v17, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b16_e32 v39, 8, v60
+; GFX9-NEXT:    v_lshlrev_b16_e32 v41, 8, v41
+; GFX9-NEXT:    v_or_b32_sdwa v18, v18, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v4, v4, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 8, v15
+; GFX9-NEXT:    v_or_b32_sdwa v12, v12, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 8, v16
-; GFX9-NEXT:    v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_or_b32_sdwa v16, v47, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 8, v16
-; GFX9-NEXT:    v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 8, v35
-; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_or_b32_sdwa v16, v36, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b16_e32 v36, 8, v60
-; GFX9-NEXT:    v_or_b32_sdwa v17, v17, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b16_e32 v36, 8, v56
-; GFX9-NEXT:    v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v36, 8, v33
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_lshlrev_b16_e32 v35, 8, v35
-; GFX9-NEXT:    v_or_b32_sdwa v35, v37, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 8, v15
+; GFX9-NEXT:    v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v36, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 8, v15
+; GFX9-NEXT:    v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v39, 8, v51
+; GFX9-NEXT:    v_or_b32_sdwa v38, v38, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v54
+; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v53
+; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 8, v15
+; GFX9-NEXT:    v_or_b32_sdwa v15, v46, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v15
-; GFX9-NEXT:    v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -167513,14 +167428,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT:    v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:24
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -167530,16 +167447,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -167549,15 +167466,18 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:40
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v38
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -167565,16 +167485,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -167584,32 +167504,32 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:56
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT:    v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:64
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v43
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v35
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -167618,57 +167538,49 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v50
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v44
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT:    v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:72
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v62
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v63
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v43
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v33
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:76
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v55
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT:    v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:80
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v58
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v50
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v37
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:84
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v58
+; GFX9-NEXT:    v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT:    v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:88
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v48
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v63
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
@@ -167676,11 +167588,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:92
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
@@ -167689,10 +167601,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:96
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -167702,21 +167614,23 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:100
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:104
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -167726,21 +167640,23 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:108
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:112
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -167750,11 +167666,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:116
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
@@ -167763,12 +167679,15 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:120
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v61
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -167838,117 +167757,117 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v33, off, s32 offset:8
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr108_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr107_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr106_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr163_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr162_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr105_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr104_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr95_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr179_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr94_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr160_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr93_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr91_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr42_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr180_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr90_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr164_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr88_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr165_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr78_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr46_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr75_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr167_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr77_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr76_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr60_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr178_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr63_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr182_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr179_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr62_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr74_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr57_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr72_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr47_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr45_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr56_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr42_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr43_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr89_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr40_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr43_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr45_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr44_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr79_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr59_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr182_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr60_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr181_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr94_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr167_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr58_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr178_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr165_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr176_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr88_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr166_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr77_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr163_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr76_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr160_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr104_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr74_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr161_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr183_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr92_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr91_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr92_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr79_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr89_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr78_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr75_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr73_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr72_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr61_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr59_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr58_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr47_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr44_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr56_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr46_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr41_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr183_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr40_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr180_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr177_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr164_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr162_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr176_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr166_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr161_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_lo16
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v33
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
@@ -167970,185 +167889,183 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[67:68], 24, v[3:4]
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v149, 24, v16
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v161, 8, v16
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v166, 8, v15
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v176, 24, v14
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v178, 8, v14
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v181, 8, v13
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v44, 24, v12
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v45, 8, v12
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v47, 8, v11
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v57, 24, v10
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v62, 8, v10
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v63, 8, v9
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v76, 24, v8
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v77, 8, v8
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v90, 8, v7
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v91, 24, v6
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v93, 8, v6
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v94, 8, v5
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v95, 24, v4
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v104, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v135, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v151, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v160, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v163, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v167, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v181, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v182, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v40, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v43, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v56, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v57, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v62, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v63, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v75, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v78, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v88, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v90, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v93, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v95, 8, v4
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v105, 8, v3
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v106, 24, v2
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v107, 8, v2
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v108, 8, v1
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v118, 24, v32
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v119, 8, v32
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v31
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 24, v30
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 8, v30
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v135, 8, v29
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v144, 24, v28
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 8, v28
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v151, 8, v27
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v162, 24, v26
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v164, 8, v26
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v177, 8, v25
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v180, 24, v24
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v40, 8, v24
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v41, 8, v23
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v46, 24, v22
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v56, 8, v22
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v59, 8, v21
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v61, 24, v20
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v73, 8, v20
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v75, 8, v19
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v78, 24, v18
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v89, 8, v18
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v92, 8, v17
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v183.h, v16.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v165.h, v14.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v150.h, v12.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v145.h, v10.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v134.h, v8.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v131.h, v6.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v128.h, v4.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v117.h, v2.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v117, 24, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v118, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v119, 8, v31
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v128, 24, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 8, v29
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 24, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v148, 24, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v150, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v161, 8, v25
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v166, 24, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v176, 8, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v177, 8, v23
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v183, 24, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v41, 8, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v44, 8, v21
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v47, 24, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v58, 8, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v61, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v72, 24, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v73, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v79, 8, v17
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[68:69], 24, v[1:2]
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v102.h, v32.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v98.h, v30.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v87.h, v28.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v85.h, v26.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v82.h, v24.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v80.h, v22.h
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v70.h, v20.h
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[19:20]
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.h, v18.h
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[17:18]
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v74.h, v16.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v88.h, v15.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v58.h, v14.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v79.h, v13.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v43.h, v12.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v72.h, v11.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v182.h, v10.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v60.h, v9.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v167.h, v8.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v42.h, v7.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v160.h, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v179.h, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v144.h, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v131.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v132.h, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v162.h, v3.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v147.h, v4.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v163.h, v3.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v132.h, v2.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v148.h, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v114.h, v32.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v116.h, v31.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v112.h, v30.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v115.h, v29.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v101.h, v28.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v113.h, v27.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v99.h, v26.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v103.h, v25.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v96.h, v24.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v100.h, v23.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v84.h, v22.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v97.h, v21.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v149.h, v4.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v180.h, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v164.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v165.h, v6.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v46.h, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v178.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v179.h, v8.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v74.h, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v45.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v42.h, v10.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v89.h, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v59.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v60.h, v12.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v94.h, v13.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v77.h, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v76.h, v14.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v104.h, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v91.h, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v92.h, v16.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v80.h, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.h, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v70.h, v18.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v83.h, v19.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v81.h, v20.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v86.h, v19.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v71.h, v18.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v83.h, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v71.h, v20.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v85.h, v21.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v82.h, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v84.h, v22.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v96.h, v23.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v86.h, v24.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v87.h, v24.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v101.h, v25.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v97.h, v26.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v98.h, v26.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v112.h, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v99.h, v28.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v100.h, v28.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v113.h, v29.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v103.h, v30.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v102.h, v30.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v116.h, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v114.h, v32.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v115.h, v32.h
 ; GFX11-TRUE16-NEXT:  .LBB90_2: ; %Flow
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB90_4
 ; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v17
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v51, 0x400000, v17
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v18
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v20
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v20, 16, v20
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_f32 v35, 0x40c00000, v35
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v18, 16, v18
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v33, 16, 1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v18, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v18
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v33
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v36, v33, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v37, v18, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v71, v37, v39 :: v_dual_and_b32 v34, 0xffff0000, v17
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_lshlrev_b32 v17, 16, v17
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v55, v37, v39, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v71.h
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v20
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v50, v17, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v55.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v70, v36, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v35, 0x40c00000, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v50, v17, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v48, v34, 16, 1
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v55, v36, v38, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v34
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v50, v17, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v51, 0x400000, v17
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v48, v34, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v50, v17, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v83, v37, v51, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v80, v37, v51, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v48, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v35
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v20
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v83.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v80.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v17, v18, v49, vcc_lo
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v18, 0xffff, v33, v55
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v18, 0xffff, v33, v70
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v20, 16, 1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v17, 0xffff, v34, v17
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v36, v35, 0x7fff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v19
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v20, 0x7fff
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v78, 24, v18
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v89, 8, v18
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v92, 8, v17
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v72, 24, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v73, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v79, 8, v17
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v81, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_lshlrev_b32 v19, 16, v19
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v22
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_lshlrev_b32 v22, 16, v22
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v36, 16, 1
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v70, v34, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v71, v34, v38, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_add_f32 v19, 0x40c00000, v19
-; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v36, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v81.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v19, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v19
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v19, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v86, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_cndmask_b32 v83, v33, v37
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v22, 16, 1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.l, v86.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.l, v83.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v36, v20, v38, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v33, v22, 0x7fff
@@ -168156,267 +168073,264 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v21
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v84, v20, v33 :: v_dual_add_f32 v35, 0x40c00000, v35
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v20, 0xffff, v34, v70
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v82, v20, v33 :: v_dual_add_f32 v35, 0x40c00000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v20, 0xffff, v34, v71
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v84.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v82.h
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v35, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v35
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v61, 24, v20
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v73, 8, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v47, 24, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v58, 8, v20
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v35, 0x7fff
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v80, v19, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v33, 0x40c00000, v38 :: v_dual_cndmask_b32 v84, v19, v39
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v19, 0xffff, v37, v36
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v24
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_lshlrev_b32 v24, 16, v24
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v22, 0xffff, v22, v80
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v75, 8, v19
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v21, 0x40c00000, v21
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v33, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v33
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v21, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v21
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v46, 24, v22
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v56, 8, v22
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v33, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v34, v21, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v97, v34, v37, vcc_lo
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v33, 0x40c00000, v38 :: v_dual_add_f32 v36, 0x40c00000, v36
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v24, 16, 1
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v33, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v33
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v22, 0xffff, v22, v84
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v61, 8, v19
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v85, v34, v37, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v34, v24, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
-; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v33, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v36, 16, 1
-; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v36, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v183, 24, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v41, 8, v22
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v21, v35, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v24, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v24
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v85.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v34, v24, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v37, 0xffff0000, v23
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v97.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v96, v34, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v21, 0xffff, v35, v21
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v86, v34, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v26
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
 ; GFX11-TRUE16-NEXT:    v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_lshlrev_b32 v26, 16, v26
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v87, v33, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v86.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v23, 16, 1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v24, v37, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v23
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v82, v33, v39, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v37
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v34, v23, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v24, v24, v37, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v37
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v96.h
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v100, v34, v36, vcc_lo
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v44, 8, v21
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v96, v34, v36, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v21, 0xffff, v35, v21
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v100.h
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v96.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v23, v24, v39, vcc_lo
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v24, 0xffff, v33, v82
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v59, 8, v21
-; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v24, 0xffff, v33, v87
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v26, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v23, 0xffff, v36, v23
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v34, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v25
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v26, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v26
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v180, 24, v24
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v26, 0x7fff
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v40, 8, v24
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v41, 8, v23
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v99, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v26
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v166, 24, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v176, 8, v24
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v97, v33, v37, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v26, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v177, 8, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v98, v35, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v25, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v25
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v99.h
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v85, v35, v38 :: v_dual_add_f32 v36, 0x40c00000, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v25, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v26, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v26, v26, v36, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v103, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v25, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v101, v33, v37, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT:    v_add3_u32 v26, v26, v36, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v36, v26, v38 :: v_dual_and_b32 v35, 0xffff0000, v28
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v28, 16, v28
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.l, v103.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v28
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v97.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v36, v26, v38 :: v_dual_add_f32 v35, 0x40c00000, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.l, v101.h
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v25, v35, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v28, 16, 1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v35
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v25, v25, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v26, v33, v28, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v33, 0x400000, v28
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v101, v26, v33, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v99, v26, v33, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v26, 0xffff, v34, v85
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v87, v25, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v26, 0xffff, v34, v98
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v100, v25, v39, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v25, 0xffff, v37, v36
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v30
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v27, 0x40c00000, v27 :: v_dual_lshlrev_b32 v30, 16, v30
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v33, 0x40c00000, v38
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v101.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v99.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v148, 24, v26
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v27, 0x40c00000, v27
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v30, 0x40c00000, v30
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v28, 0xffff, v28, v100
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v150, 8, v26
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v27, 16, 1
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v33, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v27
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 24, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v28
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v34, v27, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v33, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v28, 0xffff, v28, v87
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v162, 24, v26
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v113, v34, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v33, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v161, 8, v25
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v112, v34, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v33, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v33
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v30, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v33, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v36, 16, 1
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v144, 24, v28
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 8, v28
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v27, v35, v38, vcc_lo
-; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v34, v30, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v37, 0xffff0000, v29
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v30, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v30
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v37, 0xffff0000, v29
-; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v36, 0x7fff
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v164, 8, v26
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v177, 8, v25
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v112, v34, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v112.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v34, v30, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v27, 0xffff, v35, v27
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v32
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_lshlrev_b32 v32, 16, v32
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v103, v34, v38 :: v_dual_and_b32 v38, 0xffff0000, v32
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v30, v37, 16, 1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v98, v33, v39, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v112.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v27
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v29, 16, 1
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v102, v33, v39 :: v_dual_add_f32 v37, 0x40c00000, v37
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v29
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-TRUE16-NEXT:    v_add3_u32 v30, v30, v37, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v37
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v34, v29, 0x7fff
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v113.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v115, v34, v36 :: v_dual_add_f32 v34, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v103.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v30, v37, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v37
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v113, v34, v36, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v27, 0xffff, v35, v27
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v115.h
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v30, v30, v37, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v113.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v29, v30, v39, vcc_lo
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v30, 0xffff, v33, v98
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v32, 16, 1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v30, 0xffff, v33, v102
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v29, 0xffff, v36, v29
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v31
-; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v32, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v32
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 24, v30
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v32
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v31, 0x40c00000, v31
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v32, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v128, 24, v30
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 8, v29
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v114, v33, v37, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v31, 0x40c00000, v31
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v36, 16, 1
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 8, v30
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v114.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v102, v35, v38, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v31, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v31
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v36, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v115, v35, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v31, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v135, 8, v29
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v151, 8, v27
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v114.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v116, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v2, 16, v2
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.l, v116.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v31, v35, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v35
-; GFX11-TRUE16-NEXT:    v_add3_u32 v31, v31, v35, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v36, v32, v38, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v1
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v2, 16, 1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v31, v31, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v35
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v33, v2, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v33, 0x400000, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v132, v32, v33 :: v_dual_add_f32 v33, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v131, v32, v33, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v33, 0x40c00000, v38
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v32, 0xffff, v34, v102
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v2.l, v132.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v33, 16, 1
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v117, v31, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v32, 0xffff, v34, v115
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v132, v31, v39, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v31, 0xffff, v37, v36
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v33
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v118, 24, v32
-; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v33, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v4
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v33, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v34, v1, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v117
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v119, 8, v32
-; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v34, v1, 0x7fff
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v31
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v106, 24, v2
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v107, 8, v2
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v148, v34, v37 :: v_dual_and_b32 v37, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v33
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v33, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v2.l, v131.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v117, 24, v32
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v144, v34, v37 :: v_dual_and_b32 v37, 0xffff0000, v3
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v132
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v37, 0x40c00000, v37
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v4, 16, 1
 ; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v1, v35, v38 :: v_dual_add_f32 v36, 0x40c00000, v36
@@ -168428,170 +168342,179 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v36, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v37, 16, 1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v35.l, v148.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v36, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v147, v34, v38, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v35.l, v144.h
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v4, v4, v37, 0x7fff
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v35, v1
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v128, v33, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v106, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v107, 8, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v149, v33, v39, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v37
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v6
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v3, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v3
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v33.l, v147.h
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v108, 8, v1
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v35, v1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v34, v3, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v163, v34, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v118, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v119, 8, v31
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v108, 8, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v162, v34, v36, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v36.l, v163.h
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v3, v4, v39 :: v_dual_add_f32 v34, 0x40c00000, v38
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0xffff, v33, v128
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v36.l, v162.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v4, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v7
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v36, v3
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v5
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v5, 16, v5
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v34, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v95, 24, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v33.l, v147.h
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v38 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v105, 8, v3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0xffff, v33, v149
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v93, 24, v4
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v95, 8, v4
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v34, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v6
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v104, 8, v4
-; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v6, 0x7fff
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v105, 8, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v160, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v164, v33, v37, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v5, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v131, v35, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v165, v35, v38, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v8
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v36, 16, 1
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v5, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v8, 16, v8
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v36, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v36
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v179, v33, v37 :: v_dual_add_f32 v8, 0x40c00000, v8
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v34.l, v164.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v180, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v5, v35, 16, 1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v37.l, v179.h
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v8, 16, 1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v34.l, v160.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v37.l, v180.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v36, v6, v38, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v5, v5, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v33, v8, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v33, 0x400000, v8
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v35
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v167, v6, v33, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v178, v6, v33, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v33, 0x40c00000, v39
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v6, 0xffff, v34, v131
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v134, v5, v38 :: v_dual_and_b32 v39, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v6, 0xffff, v34, v165
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v179, v5, v38, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v37, v36
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_lshlrev_b32 v36, 16, v10
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v33, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v33
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v8.l, v167.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v10
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v7, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v7
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v34, v33, 0x7fff
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v8, 0xffff, v8, v134
-; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v7, 0x7fff
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v36, 16, 1
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v91, 24, v6
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v93, 8, v6
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v76, 24, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v42, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v33
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v8.l, v178.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v78, 24, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v46, v35, v37, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v36, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v36
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v77, 8, v8
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v94, 8, v5
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v8, 0xffff, v8, v179
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v88, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v90, 8, v5
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v34, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v36
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v42.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v182, v7, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v62, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v63, 8, v8
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v45, v7, v37, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v9
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v9, 0x40c00000, v39
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v46.h
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v9, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v9
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v36, v9, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v10, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v10
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v10, 0x7fff
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v145, v35, v38 :: v_dual_lshlrev_b32 v10, 16, v12
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v42, v35, v38, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v37, 0x40c00000, v10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v35.l, v182.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v45.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v38, v37, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v10, 0xffff, v35, v42
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v7, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v50, 0x400000, v37
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v10, 0xffff, v35, v145
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v7, 16, 1
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v38, v37, 0x7fff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v37, 0x40c00000, v51
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v57, 24, v10
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v48, v12, 16, 1
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v7, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v43, v38, v50 :: v_dual_add_f32 v12, 0x40c00000, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v52, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v43, 24, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v59, v38, v50, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_add3_u32 v48, v48, v12, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v14
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v43.h
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v48, v12, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v52, 0x400000, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v60, v35, v49, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v59.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v74, v35, v49, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v37, 16, 1
-; GFX11-TRUE16-NEXT:    v_add3_u32 v48, v48, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff0000, v11
 ; GFX11-TRUE16-NEXT:    v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_lshlrev_b32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v62, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v56, 8, v10
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v150, v48, v52, vcc_lo
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v60, v48, v52 :: v_dual_add_f32 v37, 0x40c00000, v51
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v49, v14, 16, 1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v12, 0xffff, v7, v150
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v12, 0xffff, v7, v60
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v37, 16, 1
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v9, v36, v39, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v74.h
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v35, v37, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v35, 0x400000, v37
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v60.h
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v7, 16, 1
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v44, 24, v12
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v45, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v181, 24, v12
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v9, 0xffff, v36, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v182, 8, v12
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v11, v35, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v39, v7, 0x7fff
@@ -168599,75 +168522,73 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v13
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v48, v35, 16, 1
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v9, 0xffff, v36, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v72, v37, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v89, v37, v38, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v35
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v48, v35, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v39
-; GFX11-TRUE16-NEXT:    v_add3_u32 v39, v49, v14, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v48, 0x400000, v14
-; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[9:10]
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v165, v37, v38, vcc_lo
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v37, 0xffff0000, v16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v49, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v57, 8, v9
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v76, v37, v38 :: v_dual_and_b32 v37, 0xffff0000, v16
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v7, 0x40c00000, v39 :: v_dual_lshlrev_b32 v16, 16, v16
+; GFX11-TRUE16-NEXT:    v_add3_u32 v39, v49, v14, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v49, v7, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v35, 0x400000, v7
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v16, 0x40c00000, v16 :: v_dual_add_f32 v13, 0x40c00000, v13
-; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v49, v7, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v58, v39, v48, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v77, v39, v48, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v49, v7, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.l, v77.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v7, v14, v35 :: v_dual_add_f32 v14, 0x40c00000, v37
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v15
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v48, v14, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v51, 0x400000, v37
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v50, v15, 16, 1
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v13, 16, 1
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v63, 8, v9
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.l, v58.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v14, v35, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v39, v13, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v13
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v16, 16, 1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v79, v35, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v94, v35, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v37, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v13, v13, v16, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v74, v13, v49, vcc_lo
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v14, 0x40c00000, v37 :: v_dual_lshlrev_b32 v37, 16, v15
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v48, v14, 16, 1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v48, v14, 0x7fff
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v37, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v51, 0x400000, v37
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v48, 0x400000, v14
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v50, v15, 16, 1
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v39, v39, v37, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, 0x400000, v15
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.l, v79.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v91, v13, v49, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v13, v50, v15, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v88, v39, v51, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.l, v94.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v104, v39, v51, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v72.h
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v14, 0xffff, v38, v165
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v183, v35, v48, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v89.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v14, 0xffff, v38, v76
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v92, v35, v48, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v88.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v74.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v104.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v91.h
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v11, 0xffff, v39, v11
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v176, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v160, 24, v14
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v13, v13, v16, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v178, 8, v14
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v16, 0xffff, v35, v183
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v163, 8, v14
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v16, 0xffff, v35, v92
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[17:18]
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v15, 0xffff, v15, v13
@@ -168686,27 +168607,27 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[68:69], 24, v[1:2]
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[19:20]
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v149, 24, v16
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v161, 8, v16
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v166, 8, v15
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v181, 8, v13
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v47, 8, v11
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v90, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v135, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v151, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v167, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v40, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v75, 8, v7
 ; GFX11-TRUE16-NEXT:  .LBB90_4: ; %end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v148.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v144.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v108.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v68.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v117.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v132.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.l, 8, v106.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v2.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v128.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v149.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v1.h, v2.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v132.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v131.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v107.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v1.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 8, v91.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 8, v78.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v3.l, v4.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.h, v2.h
@@ -168715,7 +168636,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v1.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v163.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v162.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v67.l
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v4, v6
@@ -168724,15 +168645,15 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v2.l, v2.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v3.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v147.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v104.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v95.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v95.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v93.l
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v8, v10
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v4.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v3.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.h, v4.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v5.l, v6.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v179.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v94.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v180.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v90.l
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v8
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v3.l
@@ -168743,10 +168664,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v6, v8
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v10
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
-; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v160.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v164.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v4.h, v5.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v93.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v131.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v88.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v165.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v4.l
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v8, v10
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v5.l
@@ -168755,8 +168676,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v65.l
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v12
-; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v42.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v90.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v46.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v75.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v5.l
@@ -168764,24 +168685,24 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v5.h, v6.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v8, v10
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v7.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v167.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v77.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v134.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v76.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v178.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v63.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v179.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v62.l
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v12
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v6.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v60.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v74.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v10, v12
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v14
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v63.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v57.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v9.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v64.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v182.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v62.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v45.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v56.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v7.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v8.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.h, v8.h
@@ -168794,25 +168715,25 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v11.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v52.l
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v10, v16
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v72.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v47.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v89.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v40.l
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v12, v14
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v18
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v20
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v9
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v145.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v57.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v150.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.l, 8, v44.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v42.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v43.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v60.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.l, 8, v181.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off
 ; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v79.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v181.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v43.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v45.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v94.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v167.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v59.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v182.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v14.l
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
@@ -168824,8 +168745,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v13.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v10, v11
-; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v58.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v178.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v77.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v163.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v5
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v15.h
@@ -168833,10 +168754,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v20, v9
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v165.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v176.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v88.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v166.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v76.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v160.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v104.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v151.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
 ; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
@@ -168844,24 +168765,24 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v7
-; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v74.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v161.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v183.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v149.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v91.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v146.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v92.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v135.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v5
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v10, v11
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v71.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v89.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v73.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v9
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v83.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v92.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v80.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v79.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v17.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v53.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v78.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v70.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v72.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v16, v18
 ; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
@@ -168870,24 +168791,24 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v12, v13
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v14, v15
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v86.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v75.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v83.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v61.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v19.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v50.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v10
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v70.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v61.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v47.l
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff, v8
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v16, v17
 ; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v81.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v73.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v58.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v13.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v11
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v97.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v59.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v85.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v44.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v18, v9
 ; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v14.h
@@ -168898,29 +168819,29 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v10
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v15, v16
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v100.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v41.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v96.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v177.l
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v14
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v56.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v46.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v41.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v183.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v11
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v17, v18
 ; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v23.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v37.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v40.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v86.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v176.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v15.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v13.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v14.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v12
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v19, v20
-; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v82.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v180.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v103.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v177.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v87.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v166.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v101.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v161.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v16.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v17.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v15
@@ -168933,16 +168854,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v19.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v16
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v99.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v164.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v85.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v162.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v97.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v150.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v148.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v15.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v24, v14
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v18
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v19
-; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v113.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v151.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v112.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v145.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v27.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v35.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v16.h
@@ -168955,16 +168876,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v20, v21
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v22, v23
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v24, v25
-; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v101.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v146.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v87.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v144.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v115.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v135.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v99.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v134.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v100.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v133.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v113.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v130.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v29.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v34.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v112.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v24.h, 8, v133.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v103.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v24.h, 8, v129.l
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v28, 0xffff, v18
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
 ; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v20.l, v20.h
@@ -168979,17 +168900,17 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xffff, v22
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v23
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v24
-; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v98.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v130.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v102.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v128.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v116.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v129.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v119.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v31.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v33.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v114.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v119.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v102.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v24.h, 8, v118.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v118.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v115.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v24.h, 8, v117.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v20.l, v20.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v21.l, v21.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v22.h
@@ -169057,132 +168978,128 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v128i8:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_clause 0x19
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s32 offset:112
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s32 offset:108
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s32 offset:104
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v43, s32 offset:100
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v44, s32 offset:96
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v45, s32 offset:92
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v46, s32 offset:88
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v47, s32 offset:84
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v56, s32 offset:80
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v57, s32 offset:76
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v58, s32 offset:72
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v59, s32 offset:68
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v60, s32 offset:64
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v61, s32 offset:60
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v62, s32 offset:56
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v63, s32 offset:52
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v72, s32 offset:48
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v73, s32 offset:44
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v74, s32 offset:40
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v75, s32 offset:36
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v76, s32 offset:32
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v77, s32 offset:28
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v78, s32 offset:24
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v79, s32 offset:20
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v88, s32 offset:16
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v89, s32 offset:12
+; GFX11-FAKE16-NEXT:    s_clause 0x15
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s32 offset:96
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s32 offset:92
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v43, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v44, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v45, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v46, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v47, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v56, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v57, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v58, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v59, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v60, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v61, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v62, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v63, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v72, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v73, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v74, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v75, s32 offset:20
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v76, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v77, s32 offset:12
 ; GFX11-FAKE16-NEXT:    s_clause 0x2
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v33, off, s32 offset:8
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr89
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr88
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr99
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr78
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr180
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr76
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr75
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr73
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr98
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr72
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr166
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr63
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr62
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr59
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr60
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr58
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr160
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr56
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr161
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr47
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr45
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr43
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr44
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr42
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr146
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr147
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr41
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr183
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr181
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr180
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr132
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr179
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr133
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr178
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr176
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr177
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr167
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr165
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr119
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr164
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr131
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr163
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr162
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr151
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr150
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr118
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr149
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr148
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr145
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr144
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr116
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr135
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr79
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr77
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr134
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr131
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr130
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr129
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr74
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr132
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr63
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr72
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr61
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr60
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr160
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr59
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr57
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr47
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr46
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr147
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr44
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr40
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr146
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr43
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr134
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr40
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr133
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr182
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr178
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr128
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr176
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr163
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr177
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr151
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr130
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr165
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr100
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr119
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr99
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr97
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr128
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr118
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr101
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr117
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr102
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr116
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr112
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr115
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr113
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr114
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v33
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
@@ -169191,867 +169108,863 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB90_2
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[29:30]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[15:16]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[80:81], 24, v[11:12]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[85:86], 24, v[7:8]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[98:99], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[13:14]
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[31:32]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[27:28]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 24, v16
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 16, v16
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v144, 8, v16
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v145, 16, v15
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v148, 8, v15
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v149, 24, v14
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v129, 16, v14
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v150, 8, v14
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v151, 16, v13
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v162, 8, v13
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v163, 24, v12
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v12
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v164, 8, v12
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v167, 16, v11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v176, 8, v11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v178, 24, v10
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v133, 16, v10
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v179, 8, v10
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v181, 16, v9
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v183, 8, v9
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v41, 24, v8
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v146, 16, v8
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v42, 8, v8
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v43, 16, v7
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v45, 8, v7
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v47, 24, v6
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v161, 16, v6
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v56, 8, v6
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v58, 16, v5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v59, 8, v5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v62, 24, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v166, 16, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v72, 8, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v73, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v75, 8, v3
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v76, 24, v2
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v180, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v78, 8, v2
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v88, 16, v1
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v89, 8, v1
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v114, 24, v32
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v32
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v113, 8, v32
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v31
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 8, v31
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 24, v30
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v30
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v102, 8, v30
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v117, 16, v29
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v29
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v118, 24, v28
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v28
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v97, 8, v28
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 16, v27
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 8, v27
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v165, 24, v26
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v130, 16, v26
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v26
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v177, 16, v25
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v25
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v182, 24, v24
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v134, 16, v24
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v24
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v40, 16, v23
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v23
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v44, 24, v22
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v147, 16, v22
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v22
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v46, 16, v21
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 8, v21
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v57, 24, v20
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v160, 16, v20
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v60, 8, v20
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v61, 16, v19
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v63, 8, v19
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 24, v18
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 16, v18
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v74, 8, v18
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v77, 16, v17
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v79, 8, v17
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[67:68], 24, v[13:14]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[81:82], 24, v[9:10]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[86:87], 24, v[5:6]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[99:100], 24, v[1:2]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[25:26]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[23:24]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[21:22]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[19:20]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[17:18]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[51:52], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v129, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v114, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v130, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v144, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v145, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v148, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v149, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v118, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v150, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v161, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v162, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v164, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v165, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v167, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v177, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v179, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v180, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v181, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v183, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v41, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v147, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v42, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v44, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v45, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v56, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v160, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v58, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v60, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v62, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v63, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v166, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v73, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v75, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v76, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v98, 24, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v97, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 8, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 24, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v113, 16, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v101, 16, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 8, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v102, 24, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 8, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 16, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v151, 24, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v117, 16, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v163, 16, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v176, 24, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 8, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v178, 16, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v182, 24, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v133, 16, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 8, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v40, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v43, 24, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v146, 16, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v46, 8, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v47, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v57, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v59, 24, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v61, 8, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v72, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v74, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[19:20]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[67:68], 24, v[17:18]
 ; GFX11-FAKE16-NEXT:  .LBB90_2: ; %Flow
 ; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB90_4
 ; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v18
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v17
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v18
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v17
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v33, 0x40c00000, v33
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v48, v36, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v17
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v17, v33, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, 0x400000, v33
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v33
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
 ; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v17, v33, 0x7fff
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v48, v36, 0x7fff
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v17, v35 :: v_dual_add_f32 v18, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v17, v36 :: v_dual_and_b32 v18, 0xffff0000, v18
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v18, 16, 1
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v18
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v37, v18, 0x7fff
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v132, v37, v39 :: v_dual_lshlrev_b32 v37, 16, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v77, v37, v39 :: v_dual_add_f32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v20
+; GFX11-FAKE16-NEXT:    v_perm_b32 v69, v77, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, 0x400000, v34
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v59, 24, v69
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_add3_u32 v38, v38, v34, 0x7fff
-; GFX11-FAKE16-NEXT:    v_perm_b32 v34, v132, v17, 0x7060302
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 16, v132
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v35, v38, v18 :: v_dual_add_f32 v18, 0x40c00000, v37
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v19
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v36, v18, 16, 1
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v33, v33, v39 :: v_dual_add_f32 v20, 0x40c00000, v20
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v18
-; GFX11-FAKE16-NEXT:    v_add3_u32 v36, v36, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v61, 8, v69
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v33, v38, v18 :: v_dual_add_f32 v20, 0x40c00000, v20
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v48, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v36, v48, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v36, v39, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v18, 0x7fff
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v48, 0x40c00000, v19
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v20, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-FAKE16-NEXT:    v_perm_b32 v68, v34, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v20, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v35, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v38, 0x40c00000, v19 :: v_dual_lshlrev_b32 v37, 16, v22
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v20, 0x7fff
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, 0x400000, v20
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v49, v37, 16, 1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v36, v39, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-FAKE16-NEXT:    v_add3_u32 v38, v38, v20, 0x7fff
-; GFX11-FAKE16-NEXT:    v_perm_b32 v33, v33, v35, 0x7060302
-; GFX11-FAKE16-NEXT:    v_add3_u32 v20, v49, v37, 0x7fff
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v22
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v36, v48, 16, 1
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v19, v38, v19 :: v_dual_and_b32 v22, 0xffff0000, v22
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v37
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_add3_u32 v39, v36, v48, 0x7fff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v50, 0x400000, v48
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v160, 16, v19
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v74, 8, v34
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v20, v38, vcc_lo
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v49
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v21
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v77, 16, v33
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v79, 8, v33
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v37, v39, v50, vcc_lo
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v20, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v50, 0x400000, v20
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v38, 16, 1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v72, 16, v68
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v74, 8, v68
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v34, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v36, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, 0x400000, v36
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v38, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v65, v19, v18, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v20, v39, v36, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v146, 16, v19
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v43, 24, v65
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v20, v34, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v19, 16, v34
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v36, v20, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v39, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v20
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-FAKE16-NEXT:    v_perm_b32 v38, v19, v18, 0x7060302
-; GFX11-FAKE16-NEXT:    v_perm_b32 v37, v37, v36, 0x7060302
-; GFX11-FAKE16-NEXT:    v_add3_u32 v39, v39, v20, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_add3_u32 v36, v36, v20, 0x7fff
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v19, 16, v36
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v57, 24, v38
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v60, 8, v38
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v39, v50, vcc_lo
-; GFX11-FAKE16-NEXT:    v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_add_f32 v51, 0x40c00000, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v24
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v61, 16, v37
+; GFX11-FAKE16-NEXT:    v_perm_b32 v64, v35, v34, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v46, 8, v65
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v48, v37, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v36, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_add_f32 v39, 0x40c00000, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v47, 16, v64
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v57, 8, v64
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[64:65]
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v49, v22, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v22, 16, 1
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v22
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v51, 16, 1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v24
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v36, v39, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v22, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v22, v48, v37, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v48, 0x400000, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX11-FAKE16-NEXT:    v_add3_u32 v36, v36, v39, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v35, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, 0x400000, v37
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
-; GFX11-FAKE16-NEXT:    v_add3_u32 v49, v49, v22, 0x7fff
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v48, 0x40c00000, v48
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v63, 8, v37
-; GFX11-FAKE16-NEXT:    v_add3_u32 v39, v39, v51, 0x7fff
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v49, v21, vcc_lo
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v52, v48, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v49, 0x400000, v48
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v53, v21, v20, 0x7060302
-; GFX11-FAKE16-NEXT:    v_add3_u32 v22, v52, v48, 0x7fff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v52, 0x400000, v51
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v147, 16, v21
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v22, v49, vcc_lo
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v22, 0x40c00000, v50
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v23
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v71, v21, v20, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v133, 16, v21
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v35, v22, v35 :: v_dual_add_f32 v22, 0x40c00000, v38
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v23
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v48
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v49, v22, 16, 1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v39, v39, v52 :: v_dual_add_f32 v50, 0x40c00000, v50
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v51, 0x400000, v22
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-FAKE16-NEXT:    v_add3_u32 v49, v49, v22, 0x7fff
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v52, v39, v48, 0x7060302
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v24, 16, 1
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v54, 0x40c00000, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v35
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v22, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v36, v48, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v22
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v37, v22, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v70, v36, v35, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v48, 0x40c00000, v23
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v49, v38, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v37, v39 :: v_dual_lshlrev_b32 v39, 16, v26
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v36, v24, 16, 1
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v24
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v55, v50, 16, 1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v49, v51, vcc_lo
-; GFX11-FAKE16-NEXT:    v_add3_u32 v39, v39, v24, 0x7fff
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v49, v54, 16, 1
-; GFX11-FAKE16-NEXT:    v_add3_u32 v24, v55, v50, 0x7fff
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v26
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v64, 0x400000, v54
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v39, v23, vcc_lo
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v50
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-FAKE16-NEXT:    v_add3_u32 v51, v49, v54, 0x7fff
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v46, 16, v52
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v134, 16, v23
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v24, v39, vcc_lo
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v24, 0x40c00000, v55
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v50, v24, 16, 1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v51, v64, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v48, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX11-FAKE16-NEXT:    v_add3_u32 v36, v36, v24, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v24, v49, v38, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v49, 0x400000, v48
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v37, v48, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v26, 0x40c00000, v26 :: v_dual_cndmask_b32 v23, v36, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v38
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v40, 16, v70
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v23
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v24, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v24, 0x40c00000, v39
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v25
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, 0x400000, v24
-; GFX11-FAKE16-NEXT:    v_add3_u32 v50, v50, v24, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v81, v23, v22, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v24, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v37, v37, v49, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v39, 0x40c00000, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v48, 0x400000, v24
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    v_perm_b32 v69, v39, v49, 0x7060302
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v55, 0x40c00000, v25
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v51, 0x40c00000, v51
-; GFX11-FAKE16-NEXT:    v_perm_b32 v70, v23, v22, 0x7060302
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v50, v54, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v49
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v50, v55, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v65, 0x400000, v55
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v64, v51, 16, 1
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v40, 16, v69
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_add3_u32 v54, v50, v55, 0x7fff
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v26, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v38, v38, v24, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v80, v37, v36, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v26, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v49, 0x40c00000, v25
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v26
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v50, v39, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v38, v48, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v37, v26, 0x7fff
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-FAKE16-NEXT:    v_add3_u32 v39, v39, v26, 0x7fff
-; GFX11-FAKE16-NEXT:    v_add3_u32 v26, v64, v51, 0x7fff
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v28
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v25, v39, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v51
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v49, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v26, v50, v39, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v50, 0x400000, v49
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v37, v25, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v39
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-FAKE16-NEXT:    v_add3_u32 v38, v38, v49, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v178, 16, v80
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v37, v26, v37 :: v_dual_add_f32 v26, 0x40c00000, v48
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v130, 16, v25
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v50, v26, v39, vcc_lo
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v26, 0x40c00000, v64
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v51, v26, 16, 1
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v39, v54, v65 :: v_dual_lshlrev_b32 v54, 16, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v27
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, 0x400000, v26
-; GFX11-FAKE16-NEXT:    v_add3_u32 v51, v51, v26, 0x7fff
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v83, v39, v50, 0x7060302
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v28, 16, 1
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v54, 0x40c00000, v54
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v26, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v38, v38, v50, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v49, 0x400000, v26
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v48, 0x40c00000, v48
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v64, 0x40c00000, v27
+; GFX11-FAKE16-NEXT:    v_add3_u32 v39, v39, v26, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v82, v38, v37, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v28, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v50, 0x40c00000, v27
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, 0x400000, v28
-; GFX11-FAKE16-NEXT:    v_add3_u32 v39, v39, v28, 0x7fff
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v65, v54, 16, 1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v26, v51, v55, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v51, v48, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v26, v39, v49, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v38, v38, v28, 0x7fff
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v51, v64, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v66, 0x400000, v64
-; GFX11-FAKE16-NEXT:    v_add3_u32 v28, v65, v54, 0x7fff
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v30
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v39, v27, vcc_lo
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v54
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX11-FAKE16-NEXT:    v_add3_u32 v55, v51, v64, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v50, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v28, v51, v48, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v51, 0x400000, v50
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v38, v27, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v48
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-FAKE16-NEXT:    v_add3_u32 v39, v39, v50, 0x7fff
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
-; GFX11-FAKE16-NEXT:    v_perm_b32 v84, v25, v24, 0x7060302
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v27
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v51, v28, v39 :: v_dual_add_f32 v28, 0x40c00000, v65
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v64, v64
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v30, 0x40c00000, v30
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v165, 24, v84
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v54, v28, 16, 1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v55, v66, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v29
+; GFX11-FAKE16-NEXT:    v_perm_b32 v83, v25, v24, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v38, v28, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v28, 0x40c00000, v49
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v30, 0x40c00000, v30 :: v_dual_lshlrev_b32 v49, 16, v29
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v64, 0x400000, v28
-; GFX11-FAKE16-NEXT:    v_add3_u32 v54, v54, v28, 0x7fff
-; GFX11-FAKE16-NEXT:    v_perm_b32 v96, v39, v51, 0x7060302
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v30, 16, 1
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v55, 0x40c00000, v55
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v48, v28, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v39, v51, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v50, 0x400000, v28
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v49, 0x40c00000, v49
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v65, 0x40c00000, v29
+; GFX11-FAKE16-NEXT:    v_add3_u32 v48, v48, v28, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v84, v39, v38, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v30, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v51, 0x40c00000, v29
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, 0x400000, v30
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v52, v49, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v48, v50, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_add3_u32 v39, v39, v30, 0x7fff
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v66, v55, 16, 1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v54, v64, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v54, v65, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v67, 0x400000, v65
-; GFX11-FAKE16-NEXT:    v_add3_u32 v30, v66, v55, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v48, v51, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v30, v52, v49, 0x7fff
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v29, v39, v29 :: v_dual_lshlrev_b32 v66, 16, v32
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v55
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-FAKE16-NEXT:    v_add3_u32 v64, v54, v65, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v52, 0x400000, v51
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v39, v29, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v49
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-FAKE16-NEXT:    v_add3_u32 v48, v48, v51, 0x7fff
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX11-FAKE16-NEXT:    v_perm_b32 v97, v27, v26, 0x7060302
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v50
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v30, v39, vcc_lo
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v30, 0x40c00000, v66
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v51
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v151, 24, v83
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v39, v30, v39 :: v_dual_add_f32 v30, 0x40c00000, v50
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v55, v30, 16, 1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v64, v67, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v31
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v65, 0x400000, v30
-; GFX11-FAKE16-NEXT:    v_add3_u32 v55, v55, v30, 0x7fff
-; GFX11-FAKE16-NEXT:    v_perm_b32 v101, v39, v54, 0x7060302
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v32, 16, 1
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v64, 0x40c00000, v64
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v49, v30, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v48, v52, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v51, 0x400000, v30
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v50, 0x40c00000, v50
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v66, 0x40c00000, v31
+; GFX11-FAKE16-NEXT:    v_add3_u32 v49, v49, v30, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v86, v48, v39, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v48, v32, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v52, 0x40c00000, v31
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, 0x400000, v32
-; GFX11-FAKE16-NEXT:    v_add3_u32 v39, v39, v32, 0x7fff
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v67, v64, 16, 1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v30, v55, v65, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v53, v50, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v30, v49, v51, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v48, v48, v32, 0x7fff
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v55, v66, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v68, 0x400000, v66
-; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v67, v64, 0x7fff
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v39, v31, vcc_lo
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v64
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v64, v64
-; GFX11-FAKE16-NEXT:    v_add3_u32 v65, v55, v66, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v49, v52, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v53, v50, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v53, 0x400000, v52
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v48, v31, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v48, 0x400000, v50
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT:    v_add3_u32 v49, v49, v52, 0x7fff
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-FAKE16-NEXT:    v_perm_b32 v102, v29, v28, 0x7060302
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v32, v39, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v32, 0x40c00000, v67
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[101:102]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[96:97]
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v65, v68, vcc_lo
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v64, v32, 16, 1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v66, 0x400000, v32
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v68, 0x400000, v2
-; GFX11-FAKE16-NEXT:    v_perm_b32 v112, v39, v55, 0x7060302
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v2, 16, 1
-; GFX11-FAKE16-NEXT:    v_add3_u32 v64, v64, v32, 0x7fff
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v65, 0x40c00000, v65
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[83:84]
-; GFX11-FAKE16-NEXT:    v_add3_u32 v39, v39, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v85, v27, v26, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v32, v48, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v32, 0x40c00000, v51
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v51, 16, v1
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v71, v65, 16, 1
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v84
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v67, 0x40c00000, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v64, v66, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v50, v32, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v49, v53, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v52, 0x400000, v32
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v51, 0x40c00000, v51
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT:    v_add3_u32 v50, v50, v32, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v96, v49, v48, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v49, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v53, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v55, v51, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v50, v52, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v49, v49, v2, 0x7fff
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v64, 0x400000, v65
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v67, 16, 1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v39, v68, vcc_lo
-; GFX11-FAKE16-NEXT:    v_add3_u32 v39, v71, v65, 0x7fff
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v4
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX11-FAKE16-NEXT:    v_add3_u32 v66, v32, v67, 0x7fff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v71, 0x400000, v67
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v53, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v50, 0x400000, v51
+; GFX11-FAKE16-NEXT:    v_perm_b32 v87, v29, v28, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v49, v54, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v49, v55, v51, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX11-FAKE16-NEXT:    v_add3_u32 v52, v32, v53, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, 0x400000, v53
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-FAKE16-NEXT:    v_perm_b32 v115, v2, v1, 0x7060302
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v39, v64, vcc_lo
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v39, 0x40c00000, v68
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v67, v67
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v49, v50, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v49, 0x40c00000, v54
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v180, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v65, v39, 16, 1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v66, v71, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v3
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v51, v49, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v50, v52, v55, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v67, 0x400000, v39
-; GFX11-FAKE16-NEXT:    v_add3_u32 v65, v65, v39, 0x7fff
-; GFX11-FAKE16-NEXT:    v_perm_b32 v114, v64, v32, 0x7060302
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v64, v4, 16, 1
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v66, 0x40c00000, v66
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v53, 0x400000, v49
+; GFX11-FAKE16-NEXT:    v_add3_u32 v51, v51, v49, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v98, v50, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v50, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v52, 0x40c00000, v52
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v68, 0x400000, v4
-; GFX11-FAKE16-NEXT:    v_add3_u32 v64, v64, v4, 0x7fff
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v71, v66, 16, 1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v65, v67, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_add3_u32 v50, v50, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v55, v52, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v51, v53, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v65, v3, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v67, 0x400000, v66
-; GFX11-FAKE16-NEXT:    v_perm_b32 v113, v31, v30, 0x7060302
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v76, 24, v115
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v64, v68, vcc_lo
-; GFX11-FAKE16-NEXT:    v_add3_u32 v64, v71, v66, 0x7fff
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v6
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX11-FAKE16-NEXT:    v_add3_u32 v65, v65, v3, 0x7fff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v71, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v51, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v53, 0x400000, v52
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v75, 16, v98
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v76, 8, v98
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v50, v54, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v50, v55, v52, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v6
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-FAKE16-NEXT:    v_add3_u32 v51, v51, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, 0x400000, v3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v66, 0x40c00000, v68
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v64, v67, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v52, 0x40c00000, v54
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v50, v50, v53, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v67, 16, v5
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v53, 16, v5
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v68, 0x400000, v66
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v65, v71, vcc_lo
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v65, v66, 16, 1
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v67, 0x40c00000, v67
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, 0x400000, v52
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v51, v55, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v51, v52, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v53, 0x40c00000, v53
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-FAKE16-NEXT:    v_perm_b32 v116, v3, v64, 0x7060302
+; GFX11-FAKE16-NEXT:    v_perm_b32 v100, v3, v50, 0x7060302
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v3, v6, 16, 1
-; GFX11-FAKE16-NEXT:    v_add3_u32 v65, v65, v66, 0x7fff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v71, 0x400000, v6
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v80, v67, 16, 1
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v66, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v51, v51, v52, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v66, v53, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v52, v5, 16, 1
 ; GFX11-FAKE16-NEXT:    v_add3_u32 v3, v3, v6, 0x7fff
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v65, v68, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v51, v54, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v80, v67, 0x7fff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v68, 0x400000, v67
-; GFX11-FAKE16-NEXT:    v_add3_u32 v66, v66, v5, 0x7fff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v80, 0x400000, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v71, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v71, 16, v8
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v67, v67
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v66, v53, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, 0x400000, v53
+; GFX11-FAKE16-NEXT:    v_add3_u32 v52, v52, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v66, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v55, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v32
-; GFX11-FAKE16-NEXT:    v_perm_b32 v119, v3, v65, 0x7060302
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v161, 16, v3
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v67, v6, v68 :: v_dual_add_f32 v6, 0x40c00000, v71
+; GFX11-FAKE16-NEXT:    v_perm_b32 v97, v31, v30, 0x7060302
+; GFX11-FAKE16-NEXT:    v_perm_b32 v103, v3, v51, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v147, 16, v3
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v6, v54 :: v_dual_add_f32 v6, 0x40c00000, v55
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v7
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v71, 0x400000, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v66, v80, vcc_lo
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v66, v6, 16, 1
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v68, 0x40c00000, v68
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v52, v66, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v52, v6, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v54, 0x40c00000, v54
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
-; GFX11-FAKE16-NEXT:    v_perm_b32 v118, v5, v67, 0x7060302
+; GFX11-FAKE16-NEXT:    v_perm_b32 v102, v5, v53, 0x7060302
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v8, 16, 1
-; GFX11-FAKE16-NEXT:    v_add3_u32 v66, v66, v6, 0x7fff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v80, 0x400000, v8
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v81, v68, 16, 1
-; GFX11-FAKE16-NEXT:    v_perm_b32 v117, v4, v39, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v52, v52, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v66, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v67, v54, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v99, v2, v1, 0x7060302
 ; GFX11-FAKE16-NEXT:    v_add3_u32 v5, v5, v8, 0x7fff
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v66, v71, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v52, v55, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v66, v7, 16, 1
-; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v81, v68, 0x7fff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v71, 0x400000, v68
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v81, 0x400000, v7
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v5, v80 :: v_dual_lshlrev_b32 v80, 16, v10
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v68, v68
-; GFX11-FAKE16-NEXT:    v_add3_u32 v66, v66, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v52, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v67, v54, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, 0x400000, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v67, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v66, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v10
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-FAKE16-NEXT:    v_add3_u32 v52, v52, v7, 0x7fff
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v28
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v68, v8, v71, vcc_lo
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v80
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v166, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v8, v55, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v66
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_lshlrev_b32 v71, 16, v9
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_lshlrev_b32 v55, 16, v9
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v80, 0x400000, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v66, v81, vcc_lo
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v66, v8, 16, 1
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v71, 0x40c00000, v71
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v66, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v52, v67, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v52, v8, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v55, 0x40c00000, v55
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
-; GFX11-FAKE16-NEXT:    v_perm_b32 v44, v7, v68, 0x7060302
+; GFX11-FAKE16-NEXT:    v_perm_b32 v182, v7, v54, 0x7060302
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v10, 16, 1
-; GFX11-FAKE16-NEXT:    v_add3_u32 v66, v66, v8, 0x7fff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v81, 0x400000, v10
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v82, v71, 16, 1
-; GFX11-FAKE16-NEXT:    v_perm_b32 v45, v5, v6, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v52, v52, v8, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v67, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v112, v55, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v183, v5, v6, 0x7060302
 ; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v10, 0x7fff
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v66, v80, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v52, v66, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v66, v9, 16, 1
-; GFX11-FAKE16-NEXT:    v_add3_u32 v10, v82, v71, 0x7fff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v80, 0x400000, v71
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v82, 0x400000, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v81, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v12
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v71, v71
-; GFX11-FAKE16-NEXT:    v_add3_u32 v66, v66, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v52, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v10, v112, v55, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v66, 0x400000, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v112, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v67, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-FAKE16-NEXT:    v_add3_u32 v52, v52, v9, 0x7fff
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v64
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v26
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v71, v10, v80 :: v_dual_add_f32 v10, 0x40c00000, v81
+; GFX11-FAKE16-NEXT:    v_perm_b32 v101, v4, v49, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v160, 16, v4
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v55, v10, v66 :: v_dual_add_f32 v10, 0x40c00000, v67
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v24
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v22
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v80, v10, 16, 1
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v66, v82 :: v_dual_lshlrev_b32 v66, 16, v11
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v81, v12, 16, 1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v49
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v50
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v66, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v52, v112 :: v_dual_lshlrev_b32 v52, 16, v11
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v67, v12, 16, 1
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v182, v9, v71, 0x7060302
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v9, 0x40c00000, v66
-; GFX11-FAKE16-NEXT:    v_add3_u32 v66, v80, v10, 0x7fff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v80, 0x400000, v10
-; GFX11-FAKE16-NEXT:    v_add3_u32 v81, v81, v12, 0x7fff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v82, 0x400000, v12
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v85, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v176, v9, v55, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v9, 0x40c00000, v52
+; GFX11-FAKE16-NEXT:    v_add3_u32 v52, v66, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v66, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_add3_u32 v67, v67, v12, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v112, 0x400000, v12
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v113, v9, 16, 1
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_add_f32 v11, 0x40c00000, v11 :: v_dual_cndmask_b32 v10, v66, v80
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v11, 0x40c00000, v11 :: v_dual_cndmask_b32 v10, v52, v66
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v80, 0x400000, v9
-; GFX11-FAKE16-NEXT:    v_add3_u32 v66, v85, v9, 0x7fff
-; GFX11-FAKE16-NEXT:    v_perm_b32 v183, v7, v8, 0x7060302
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v20
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v81, v82, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v14
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v82, v11, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v66, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_add3_u32 v52, v113, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v177, v7, v8, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v32
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v67, v112 :: v_dual_lshlrev_b32 v67, 16, v14
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v112, v11, 16, 1
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v18
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v17
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v35
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v66, v80 :: v_dual_add_f32 v66, 0x40c00000, v81
-; GFX11-FAKE16-NEXT:    v_add3_u32 v80, v82, v11, 0x7fff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v81, 0x400000, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v26
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v52, v66 :: v_dual_add_f32 v52, 0x40c00000, v67
+; GFX11-FAKE16-NEXT:    v_add3_u32 v66, v112, v11, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v67, 0x400000, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v112, 16, v13
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v85, v66, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v113, v52, 16, 1
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[112:113]
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v80, v81 :: v_dual_add_f32 v80, 0x40c00000, v82
-; GFX11-FAKE16-NEXT:    v_add3_u32 v81, v85, v66, 0x7fff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v82, 0x400000, v66
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v85, v14, 16, 1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v86, v80, 16, 1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v24
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v66, v67 :: v_dual_add_f32 v66, 0x40c00000, v112
+; GFX11-FAKE16-NEXT:    v_add3_u32 v67, v113, v52, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v112, 0x400000, v52
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v113, v14, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v114, v66, 16, 1
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v87, 0x400000, v80
-; GFX11-FAKE16-NEXT:    v_perm_b32 v176, v11, v9, 0x7060302
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v66, v81, v82, vcc_lo
-; GFX11-FAKE16-NEXT:    v_add3_u32 v81, v85, v14, 0x7fff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v82, 0x400000, v14
-; GFX11-FAKE16-NEXT:    v_add3_u32 v85, v86, v80, 0x7fff
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v115, 0x400000, v66
+; GFX11-FAKE16-NEXT:    v_perm_b32 v162, v11, v9, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v67, v112, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v67, v113, v14, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v112, 0x400000, v14
+; GFX11-FAKE16-NEXT:    v_add3_u32 v113, v114, v66, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v114, 16, v16
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v98, v13, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v116, v13, 16, 1
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v133, 16, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v81, v82, vcc_lo
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v81, 0x40c00000, v86
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v80, v80
-; GFX11-FAKE16-NEXT:    v_add3_u32 v82, v98, v13, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v20
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v67, v112 :: v_dual_add_f32 v67, 0x40c00000, v114
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-FAKE16-NEXT:    v_add3_u32 v112, v116, v13, 0x7fff
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
-; GFX11-FAKE16-NEXT:    v_perm_b32 v163, v14, v66, 0x7060302
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v86, v81, 16, 1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v85, v87, vcc_lo
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v85, 0x400000, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v149, v14, v52, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v114, v67, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v66, v113, v115, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v113, 0x400000, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v15
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-FAKE16-NEXT:    v_perm_b32 v177, v12, v10, 0x7060302
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v12
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v10
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v82, v85 :: v_dual_add_f32 v82, 0x40c00000, v87
-; GFX11-FAKE16-NEXT:    v_add3_u32 v85, v86, v81, 0x7fff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v86, 0x400000, v81
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v87, v16, 16, 1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v81, v81
+; GFX11-FAKE16-NEXT:    v_perm_b32 v163, v12, v10, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v112, v113 :: v_dual_add_f32 v112, 0x40c00000, v115
+; GFX11-FAKE16-NEXT:    v_add3_u32 v113, v114, v67, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v114, 0x400000, v67
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v115, v16, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v67, v67
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v98, v82, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v99, 0x400000, v82
-; GFX11-FAKE16-NEXT:    v_perm_b32 v162, v13, v80, 0x7060302
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v81, v85, v86, vcc_lo
-; GFX11-FAKE16-NEXT:    v_add3_u32 v86, v87, v16, 0x7fff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v87, 0x400000, v16
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v116, v112, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v117, 0x400000, v112
+; GFX11-FAKE16-NEXT:    v_perm_b32 v148, v13, v66, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v67, v113, v114, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v114, v115, v16, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v115, 0x400000, v16
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v85, v15, 16, 1
-; GFX11-FAKE16-NEXT:    v_add3_u32 v98, v98, v82, 0x7fff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v100, 0x400000, v15
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v80
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v86, v87, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v82, v82
-; GFX11-FAKE16-NEXT:    v_add3_u32 v85, v85, v15, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v113, v15, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v116, v116, v112, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v118, 0x400000, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v33
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v114, v115, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v112, v112
+; GFX11-FAKE16-NEXT:    v_add3_u32 v113, v113, v15, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[96:97]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[86:87]
+; GFX11-FAKE16-NEXT:    v_perm_b32 v135, v16, v67, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v112, v116, v117, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v52
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[84:85]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 16, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v113, v118, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v118, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v10
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
-; GFX11-FAKE16-NEXT:    v_perm_b32 v149, v16, v81, 0x7060302
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v98, v99, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 16, v16
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v81
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v65
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[80:81], 24, v[176:177]
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v85, v100, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[85:86], 24, v[44:45]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[98:99], 24, v[116:117]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v129, 16, v14
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v66
-; GFX11-FAKE16-NEXT:    v_perm_b32 v148, v15, v82, 0x7060302
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v82
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v71
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v146, 16, v5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v68
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[148:149]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v67
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v166, 16, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v39
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v31
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 16, v55
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v29
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v54
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[67:68], 24, v[162:163]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[81:82], 24, v[182:183]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[86:87], 24, v[118:119]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[99:100], 24, v[114:115]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[69:70]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[52:53]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[37:38]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 24, v34
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[33:34]
+; GFX11-FAKE16-NEXT:    v_perm_b32 v134, v15, v112, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v112
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 16, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 16, v48
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v51
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[134:135]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[148:149]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[162:163]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[51:52], 24, v[176:177]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v53
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v117, 16, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[182:183]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[82:83]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v114, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v67
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v66
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v54
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v38
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[102:103]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[98:99]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[80:81]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v55
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v113, 16, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[100:101]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[70:71]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[67:68], 24, v[68:69]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v129, 24, v135
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v130, 8, v135
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v134
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v134
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 24, v149
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v144, 8, v149
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v145, 16, v148
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v148, 8, v148
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v149, 24, v163
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v150, 8, v163
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v151, 16, v162
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v161, 16, v162
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v162, 8, v162
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v163, 24, v177
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v164, 8, v177
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v164, 24, v177
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v165, 8, v177
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v167, 16, v176
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v176, 8, v176
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v178, 24, v183
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v179, 8, v183
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v177, 8, v176
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v179, 24, v183
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v180, 8, v183
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v181, 16, v182
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v183, 8, v182
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v41, 24, v45
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v42, 8, v45
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v43, 16, v44
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v45, 8, v44
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v47, 24, v119
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v56, 8, v119
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v58, 16, v118
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v59, 8, v118
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v62, 24, v117
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v72, 8, v117
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v73, 16, v116
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v75, 8, v116
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v78, 8, v115
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v88, 16, v114
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v89, 8, v114
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v114, 24, v113
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v113, 8, v113
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v112
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 8, v112
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 24, v102
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v102, 8, v102
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v117, 16, v101
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v101
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v118, 24, v97
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v41, 24, v103
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v42, 8, v103
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v44, 16, v102
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v45, 8, v102
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v56, 24, v101
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v58, 8, v101
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v60, 16, v100
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v62, 8, v100
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v63, 24, v99
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v73, 8, v99
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v98, 24, v97
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v97, 8, v97
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 16, v96
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v96
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 8, v96
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v177, 16, v83
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 24, v87
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v87
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v101, 16, v86
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 8, v86
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v102, 24, v85
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 8, v85
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 16, v84
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v84
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v83
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v182, 24, v70
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v163, 16, v82
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v82
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v176, 24, v81
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 8, v81
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v80
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v182, 24, v71
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 8, v71
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v70
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v69
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v44, 24, v53
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v53
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 8, v52
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v77
 ; GFX11-FAKE16-NEXT:  .LBB90_4: ; %end
 ; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v37, 8, v89
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v76
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v34, 8, v99
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v66
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v51, 8, v76
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v63
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v75
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v37
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v37, 0xff, v88
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v75
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v66, 0xff, v73
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v66, 0xff, v60
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, v37, v34
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v37, 8, v78
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v55
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v166
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, v55, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v73
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, v66, v54
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v56
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v37
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v37, 0xff, v180
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v166
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v53
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v39
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, v37, v51
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v51, 8, v98
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, v55, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v62
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v52, 8, v52
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v37
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v51, v66, v51
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v37, 8, v72
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v62
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v34
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v51
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v37
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, v55, v66
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v51, 8, v59
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v58
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v86
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v34
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v58
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v65, 0xff, v160
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v51, 8, v51
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, v65, v66
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v45
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v65, 0xff, v44
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v39
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v37
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v51
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, v55, v66
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v51, 8, v56
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v161
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v47
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v34
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v53, v65, v53
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v42
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v147
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v41
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v39
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v37
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v51
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, v55, v66
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v51, 8, v45
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v43
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v85
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v34
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v37
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v51
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, v55, v66
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v51, 8, v42
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v146
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v41
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v34
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v37
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v51
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v51, 8, v183
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, v55, v66
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v53, v55, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v183
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v181
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v81
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v52, v55, v52
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v180
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v54, 0xff, v132
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v179
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v53
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v68, 8, v179
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v52, v54, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v54, 0xff, v167
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v177
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v165
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v37
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v51
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v51, v55, v66
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v68
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v34
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v37
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v37, 0xff, v133
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v51
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v51, 8, v178
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v51, v54, v51
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v52
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v51, 0xff, v119
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v52, 8, v164
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v176
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v66, 0xff, v167
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v68, 8, v80
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v162
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v54, 0xff, v161
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v50, 8, v50
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v80, 8, v164
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v81, 0xff, v131
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v82, 8, v163
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, v37, v51
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v55
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v51, v66, v68
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v80
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, v81, v82
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v150
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v65, 0xff, v118
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v149
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v51, v51, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v50, v54, v50
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v52, v65, v66
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v37
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v50
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v55
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
 ; GFX11-FAKE16-NEXT:    s_clause 0x1
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v9, v34
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v10, v37
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v11, v51
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v12, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v9, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v10, v51
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v11, v50
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v12, v52
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v6, 8, v162
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v151
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v8, 8, v67
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v6, 8, v148
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v145
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v8, 8, v49
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v14
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v10, 8, v150
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v129
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v12, 8, v149
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v10, 8, v144
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v116
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v12, 8, v135
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v14, 8, v148
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v14, 8, v134
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v6
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v7, v8
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v9, v10
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v12
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v14
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v145
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v131
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v48
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v144
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v103
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v135
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v130
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v114
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v129
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v79
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xff, v77
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v33, 8, v33
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v74
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v39, 0xff, v72
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v48, 8, v67
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v12, v13
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v14, v15
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v16, v17
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v34, v33
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v39, v48
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
@@ -170068,35 +169981,35 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v12
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v14
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v18
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v74
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v132
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v38
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v61
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v69
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v59
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v19
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v63
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v61
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v36
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v57
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v47
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v64
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v20
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v60
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v46
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v12, v13
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v14, v15
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v16, v17
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v18, v19
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v160
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v57
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v146
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v43
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v18, 8, v52
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v46
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v18, 8, v70
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v40
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v38
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v22
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v53
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v147
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v34, 8, v44
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v71
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xff, v133
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v182
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v16
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v17, v18
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v19, v20
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v22
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v33, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v38, v39
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
@@ -170113,35 +170026,35 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v16, v17
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v18, v19
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v23
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v69
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v40
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v18, 8, v54
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v178
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v18, 8, v37
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v70
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v134
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v182
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v81
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v128
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v176
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v83
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v82
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v16
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v17, v18
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v19, v20
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v22
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v23, v24
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v177
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v50
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v163
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v36
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v26
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v84
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v130
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v165
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v83
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v117
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v151
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v27
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v96
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v119
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v34, 8, v49
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v84
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xff, v103
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v35, 8, v35
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v22, v23
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v24, v25
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v26, v27
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v33, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v36, v35
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
@@ -170158,31 +170071,31 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v22
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v23, v24
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v97
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v128
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v118
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v115
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v102
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v29
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v101
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v117
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v48
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v86
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v101
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v34
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v30
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v102
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v87
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v22, v23
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v24, v25
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v26, v27
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v28, v29
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v71
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v116
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v113
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v100
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v31
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v28, 8, v112
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v115
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v30, 8, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v28, 8, v96
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v99
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v30, 8, v33
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v32
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v32, 8, v113
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v39
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v34, 8, v114
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v32, 8, v97
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v112
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v34, 8, v98
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v26
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v27, v28
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v29, v30
@@ -170210,33 +170123,29 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:80
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[17:20], off offset:96
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-FAKE16-NEXT:    s_clause 0x19
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v89, off, s32 offset:12
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v88, off, s32 offset:16
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v79, off, s32 offset:20
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v78, off, s32 offset:24
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v77, off, s32 offset:28
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v76, off, s32 offset:32
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v75, off, s32 offset:36
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v74, off, s32 offset:40
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v73, off, s32 offset:44
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v72, off, s32 offset:48
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v63, off, s32 offset:52
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v62, off, s32 offset:56
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v61, off, s32 offset:60
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v60, off, s32 offset:64
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v59, off, s32 offset:68
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v58, off, s32 offset:72
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v57, off, s32 offset:76
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v56, off, s32 offset:80
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v47, off, s32 offset:84
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v46, off, s32 offset:88
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v45, off, s32 offset:92
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v44, off, s32 offset:96
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v43, off, s32 offset:100
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s32 offset:104
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s32 offset:108
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 offset:112
+; GFX11-FAKE16-NEXT:    s_clause 0x15
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v77, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v76, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v75, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v74, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v73, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v72, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v63, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v62, off, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v61, off, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v60, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v59, off, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v58, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v57, off, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v56, off, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v47, off, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v46, off, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v45, off, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v44, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v43, off, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s32 offset:92
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 offset:96
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
@@ -170297,484 +170206,464 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:64
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:76
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:72
-; SI-NEXT:    v_mul_f32_e32 v46, 1.0, v1
+; SI-NEXT:    s_waitcnt expcnt(4)
+; SI-NEXT:    v_mul_f32_e32 v59, 1.0, v1
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v6
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v13
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v14
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v22
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v18
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v30
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; SI-NEXT:    v_mul_f32_e32 v59, 1.0, v2
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v22
+; SI-NEXT:    v_mul_f32_e32 v46, 1.0, v2
 ; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; SI-NEXT:    v_mul_f32_e32 v44, 1.0, v3
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; SI-NEXT:    v_mul_f32_e32 v61, 1.0, v5
-; SI-NEXT:    v_mul_f32_e32 v47, 1.0, v8
+; SI-NEXT:    v_mul_f32_e32 v44, 1.0, v8
 ; SI-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; SI-NEXT:    v_mul_f32_e32 v60, 1.0, v10
+; SI-NEXT:    v_mul_f32_e32 v56, 1.0, v10
 ; SI-NEXT:    v_mul_f32_e32 v63, 1.0, v9
-; SI-NEXT:    v_mul_f32_e32 v57, 1.0, v12
-; SI-NEXT:    v_mul_f32_e32 v58, 1.0, v11
-; SI-NEXT:    v_mul_f32_e32 v45, 1.0, v14
-; SI-NEXT:    v_mul_f32_e32 v62, 1.0, v16
-; SI-NEXT:    v_mul_f32_e32 v56, 1.0, v15
-; SI-NEXT:    v_mul_f32_e32 v11, 1.0, v18
-; SI-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v19
+; SI-NEXT:    v_mul_f32_e32 v47, 1.0, v12
+; SI-NEXT:    v_mul_f32_e32 v57, 1.0, v11
+; SI-NEXT:    v_mul_f32_e32 v45, 1.0, v13
+; SI-NEXT:    v_mul_f32_e32 v58, 1.0, v15
+; SI-NEXT:    v_mul_f32_e32 v18, 1.0, v17
+; SI-NEXT:    v_mul_f32_e32 v62, 1.0, v20
+; SI-NEXT:    v_mul_f32_e32 v60, 1.0, v19
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
 ; SI-NEXT:    v_mul_f32_e32 v21, 1.0, v21
-; SI-NEXT:    v_mul_f32_e32 v22, 1.0, v24
-; SI-NEXT:    v_mul_f32_e32 v23, 1.0, v23
-; SI-NEXT:    v_mul_f32_e32 v24, 1.0, v26
-; SI-NEXT:    v_mul_f32_e32 v26, 1.0, v25
-; SI-NEXT:    v_mul_f32_e32 v15, 1.0, v28
+; SI-NEXT:    v_mul_f32_e32 v19, 1.0, v24
+; SI-NEXT:    v_mul_f32_e32 v22, 1.0, v23
+; SI-NEXT:    v_mul_f32_e32 v23, 1.0, v26
+; SI-NEXT:    v_mul_f32_e32 v24, 1.0, v25
+; SI-NEXT:    v_mul_f32_e32 v25, 1.0, v28
 ; SI-NEXT:    v_mul_f32_e32 v27, 1.0, v27
-; SI-NEXT:    v_mul_f32_e32 v18, 1.0, v29
-; SI-NEXT:    v_mul_f32_e64 v3, 1.0, s16
+; SI-NEXT:    v_mul_f32_e32 v20, 1.0, v30
+; SI-NEXT:    v_mul_f32_e32 v26, 1.0, v29
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s17
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s16
+; SI-NEXT:    v_mul_f32_e64 v11, 1.0, s19
 ; SI-NEXT:    v_mul_f32_e64 v8, 1.0, s18
 ; SI-NEXT:    v_mul_f32_e64 v5, 1.0, s21
 ; SI-NEXT:    v_mul_f32_e64 v6, 1.0, s20
-; SI-NEXT:    v_mul_f32_e64 v14, 1.0, s23
 ; SI-NEXT:    v_mul_f32_e64 v9, 1.0, s22
 ; SI-NEXT:    v_mul_f32_e64 v10, 1.0, s25
 ; SI-NEXT:    v_mul_f32_e64 v13, 1.0, s24
 ; SI-NEXT:    v_mul_f32_e64 v12, 1.0, s26
-; SI-NEXT:    v_mul_f32_e64 v16, 1.0, s29
-; SI-NEXT:    v_mul_f32_e64 v19, 1.0, s28
+; SI-NEXT:    v_mul_f32_e64 v17, 1.0, s28
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT:    v_mul_f32_e32 v31, 1.0, v20
-; SI-NEXT:    v_mul_f32_e32 v30, 1.0, v32
-; SI-NEXT:    v_mul_f32_e32 v32, 1.0, v33
+; SI-NEXT:    v_mul_f32_e32 v31, 1.0, v16
+; SI-NEXT:    v_mul_f32_e32 v29, 1.0, v32
+; SI-NEXT:    v_mul_f32_e32 v30, 1.0, v33
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_mul_f32_e32 v25, 1.0, v34
-; SI-NEXT:    v_mul_f32_e32 v29, 1.0, v35
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v38
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e32 v32, 1.0, v34
+; SI-NEXT:    v_mul_f32_e32 v14, 1.0, v35
 ; SI-NEXT:    v_mul_f32_e32 v35, 1.0, v36
 ; SI-NEXT:    v_mul_f32_e32 v37, 1.0, v37
-; SI-NEXT:    v_mul_f32_e32 v34, 1.0, v39
+; SI-NEXT:    v_mul_f32_e32 v34, 1.0, v38
+; SI-NEXT:    v_mul_f32_e32 v15, 1.0, v39
 ; SI-NEXT:    v_mul_f32_e32 v38, 1.0, v48
 ; SI-NEXT:    v_mul_f32_e32 v48, 1.0, v49
-; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_mul_f32_e32 v28, 1.0, v50
-; SI-NEXT:    s_waitcnt vmcnt(10) expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v54
-; SI-NEXT:    v_mul_f32_e32 v49, 1.0, v51
+; SI-NEXT:    s_waitcnt vmcnt(12)
+; SI-NEXT:    v_mul_f32_e32 v51, 1.0, v51
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_mul_f32_e32 v50, 1.0, v52
-; SI-NEXT:    v_mul_f32_e32 v51, 1.0, v53
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_mul_f32_e32 v54, 1.0, v55
+; SI-NEXT:    v_mul_f32_e32 v52, 1.0, v53
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_mul_f32_e32 v52, 1.0, v40
+; SI-NEXT:    v_mul_f32_e32 v33, 1.0, v54
 ; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_mul_f32_e32 v55, 1.0, v41
+; SI-NEXT:    v_mul_f32_e32 v36, 1.0, v55
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_mul_f32_e32 v39, 1.0, v42
+; SI-NEXT:    v_mul_f32_e32 v55, 1.0, v40
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_mul_f32_e32 v43, 1.0, v43
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s17
-; SI-NEXT:    v_mul_f32_e64 v42, 1.0, s19
-; SI-NEXT:    v_mul_f32_e64 v20, 1.0, s27
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e32 v41, 1.0, v41
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_mul_f32_e32 v42, 1.0, v42
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_mul_f32_e32 v54, 1.0, v43
+; SI-NEXT:    v_mul_f32_e64 v39, 1.0, s23
+; SI-NEXT:    v_mul_f32_e64 v49, 1.0, s27
+; SI-NEXT:    v_mul_f32_e64 v16, 1.0, s29
+; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB91_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_alignbit_b32 v36, v1, v3, 16
+; SI-NEXT:    v_mov_b32_e32 v43, v36
+; SI-NEXT:    v_alignbit_b32 v36, v1, v2, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
-; SI-NEXT:    v_alignbit_b32 v33, v1, v6, 16
+; SI-NEXT:    v_alignbit_b32 v6, v1, v6, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v10
-; SI-NEXT:    v_mov_b32_e32 v41, v28
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_alignbit_b32 v28, v1, v13, 16
+; SI-NEXT:    v_alignbit_b32 v2, v1, v13, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v16
-; SI-NEXT:    v_mov_b32_e32 v40, v25
-; SI-NEXT:    v_alignbit_b32 v25, v1, v19, 16
+; SI-NEXT:    v_alignbit_b32 v5, v1, v17, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; SI-NEXT:    v_alignbit_b32 v53, v1, v44, 16
+; SI-NEXT:    v_alignbit_b32 v4, v1, v3, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v44
+; SI-NEXT:    v_alignbit_b32 v3, v1, v7, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v47
-; SI-NEXT:    v_alignbit_b32 v19, v1, v7, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v57
-; SI-NEXT:    v_alignbit_b32 v57, v1, v58, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v62
-; SI-NEXT:    v_alignbit_b32 v13, v1, v56, 16
+; SI-NEXT:    v_alignbit_b32 v16, v1, v57, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v31
-; SI-NEXT:    v_alignbit_b32 v10, v1, v2, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v22
-; SI-NEXT:    v_alignbit_b32 v7, v1, v23, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v15
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v15, v11
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v42
-; SI-NEXT:    v_alignbit_b32 v44, v2, v8, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v44, v36, 24
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v13, v1, v58, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v62
+; SI-NEXT:    v_alignbit_b32 v10, v1, v60, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v11
+; SI-NEXT:    v_alignbit_b32 v44, v19, v8, 16
+; SI-NEXT:    v_alignbit_b32 v7, v1, v22, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v25
+; SI-NEXT:    v_alignbit_b32 v8, v44, v36, 24
+; SI-NEXT:    v_alignbit_b32 v60, v1, v27, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v29
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v44, v36, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v44, v36, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v39
+; SI-NEXT:    v_alignbit_b32 v57, v1, v30, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v35
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v14
-; SI-NEXT:    v_alignbit_b32 v58, v2, v9, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v44, v36, 8
+; SI-NEXT:    v_alignbit_b32 v58, v22, v9, 16
+; SI-NEXT:    v_alignbit_b32 v40, v1, v37, 16
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v58, v33, 24
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v58, v6, 24
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v58, v33, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v20
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v58, v6, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v49
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v58, v33, 8
-; SI-NEXT:    v_alignbit_b32 v47, v23, v12, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v58, v6, 8
+; SI-NEXT:    v_alignbit_b32 v47, v25, v12, 16
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v47, v28, 24
-; SI-NEXT:    v_alignbit_b32 v6, v1, v27, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v30
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v47, v2, 24
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v47, v28, 16
-; SI-NEXT:    v_alignbit_b32 v5, v1, v32, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v35
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v47, v2, 16
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v47, v28, 8
-; SI-NEXT:    v_alignbit_b32 v4, v1, v37, 16
+; SI-NEXT:    v_alignbit_b32 v8, v47, v2, 8
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v38
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v59
-; SI-NEXT:    v_alignbit_b32 v3, v1, v48, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v46
+; SI-NEXT:    v_alignbit_b32 v53, v1, v48, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v50
-; SI-NEXT:    v_alignbit_b32 v50, v2, v46, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v50, v8, v59, 16
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v50, v25, 24
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v50, v25, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v50, v5, 24
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v50, v25, 8
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT:    v_alignbit_b32 v56, v1, v51, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v52
-; SI-NEXT:    v_alignbit_b32 v1, v1, v55, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v45
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v15
-; SI-NEXT:    v_alignbit_b32 v35, v22, v17, 16
-; SI-NEXT:    v_alignbit_b32 v8, v35, v13, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v50, v5, 16
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v35, v13, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT:    v_mov_b32_e32 v17, v40
-; SI-NEXT:    v_mov_b32_e32 v40, v29
-; SI-NEXT:    v_mov_b32_e32 v32, v49
+; SI-NEXT:    v_alignbit_b32 v8, v50, v5, 8
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v52, v1, v52, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v55
+; SI-NEXT:    v_mov_b32_e32 v17, v63
+; SI-NEXT:    v_alignbit_b32 v1, v1, v41, 16
 ; SI-NEXT:    s_mov_b64 s[4:5], 0
-; SI-NEXT:    v_alignbit_b32 v52, v44, v36, 8
-; SI-NEXT:    v_alignbit_b32 v46, v35, v13, 8
-; SI-NEXT:    v_mov_b32_e32 v48, v34
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v11
-; SI-NEXT:    v_alignbit_b32 v62, v2, v61, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v62, v53, 24
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v31
+; SI-NEXT:    v_alignbit_b32 v62, v8, v61, 16
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v62, v53, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v62, v4, 24
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v62, v53, 8
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v62, v4, 16
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v60
-; SI-NEXT:    v_alignbit_b32 v55, v2, v63, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v56
+; SI-NEXT:    v_alignbit_b32 v55, v8, v63, 16
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v55, v19, 24
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v55, v3, 24
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v55, v19, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v55, v3, 16
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v55, v19, 8
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v55, v3, 8
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v48, v62, v4, 8
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 24, v31
+; SI-NEXT:    s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v37
+; SI-NEXT:    v_alignbit_b32 v38, v8, v45, 16
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 24, v11
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v38, v16, 8
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 8, v62
-; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v9
+; SI-NEXT:    v_alignbit_b32 v35, v8, v18, 16
+; SI-NEXT:    v_mov_b32_e32 v45, v8
+; SI-NEXT:    v_alignbit_b32 v8, v35, v13, 24
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v8, v35, v13, 16
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v29, v35, v13, 8
+; SI-NEXT:    v_alignbit_b32 v61, v38, v16, 24
+; SI-NEXT:    v_alignbit_b32 v41, v38, v16, 16
+; SI-NEXT:    s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v59
 ; SI-NEXT:    v_alignbit_b32 v30, v8, v21, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v8, v30, v10, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v8, v30, v10, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v8, v30, v10, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v24
-; SI-NEXT:    v_alignbit_b32 v27, v8, v26, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v23
+; SI-NEXT:    v_alignbit_b32 v27, v8, v24, 16
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v8, v27, v7, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v8, v27, v7, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v8, v27, v7, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 8, v44
-; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_alignbit_b32 v38, v51, v2, 16
-; SI-NEXT:    v_alignbit_b32 v2, v38, v57, 24
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v38, v57, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v38, v57, 8
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; SI-NEXT:    v_alignbit_b32 v37, v8, v18, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v37, v6, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v37, v6, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v37, v6, 8
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v17
-; SI-NEXT:    v_alignbit_b32 v21, v8, v29, 16
-; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v20
+; SI-NEXT:    v_alignbit_b32 v24, v8, v26, 16
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v21, v5, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v24, v60, 24
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v21, v5, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v24, v60, 8
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v21, v5, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT:    v_mov_b32_e32 v18, v9
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v29
-; SI-NEXT:    v_alignbit_b32 v61, v8, v34, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v32
+; SI-NEXT:    v_alignbit_b32 v21, v8, v14, 16
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v61, v4, 24
+; SI-NEXT:    v_alignbit_b32 v8, v21, v57, 24
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v61, v4, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v21, v57, 16
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v61, v4, 8
+; SI-NEXT:    v_alignbit_b32 v8, v21, v57, 8
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v41
-; SI-NEXT:    v_alignbit_b32 v63, v8, v49, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v34
+; SI-NEXT:    v_alignbit_b32 v18, v8, v15, 16
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v63, v3, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v18, v40, 24
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v63, v3, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v31
+; SI-NEXT:    v_alignbit_b32 v8, v18, v40, 16
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_alignbit_b32 v8, v18, v40, 8
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v63, v3, 8
-; SI-NEXT:    v_alignbit_b32 v12, v49, v54, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v28
+; SI-NEXT:    v_alignbit_b32 v63, v8, v51, 16
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v12, v56, 24
+; SI-NEXT:    v_alignbit_b32 v8, v63, v53, 24
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v12, v56, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v63, v53, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v33
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v12, v56, 8
+; SI-NEXT:    v_alignbit_b32 v8, v63, v53, 8
+; SI-NEXT:    v_alignbit_b32 v12, v40, v43, 16
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v39
-; SI-NEXT:    v_alignbit_b32 v9, v8, v43, 16
-; SI-NEXT:    v_mov_b32_e32 v43, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 24, v24
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v12, v52, 24
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 24, v2
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v12, v52, 16
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 8, v37
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v12, v52, 8
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 24, v43
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v42
+; SI-NEXT:    v_mov_b32_e32 v15, v9
+; SI-NEXT:    v_alignbit_b32 v9, v8, v54, 16
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v8, v9, v1, 24
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 8, v21
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v8, v9, v1, 16
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 24, v29
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v8, v9, v1, 8
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 8, v61
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v8, v37
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 24, v49
+; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 24, v42
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 24, v46
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 24, v56
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 24, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 24, v8
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 24, v41
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 24, v34
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 24, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 24, v59
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 8, v38
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 8, v63
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 24, v28
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 24, v60
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 24, v15
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 24, v31
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 24, v33
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 24, v45
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 24, v59
+; SI-NEXT:    v_lshrrev_b32_e32 v20, 24, v20
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 8, v12
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 8, v12
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 24, v15
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 8, v30
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 24, v39
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v20, 8, v24
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 24, v20
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 24, v42
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 24, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 24, v39
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 24, v18
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 24, v23
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 8, v9
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT:    v_mov_b32_e32 v24, v51
-; SI-NEXT:    v_mov_b32_e32 v51, v2
-; SI-NEXT:    v_mov_b32_e32 v2, v22
-; SI-NEXT:    v_mov_b32_e32 v22, v23
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v34, 8, v58
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 8, v47
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 8, v50
-; SI-NEXT:    v_lshrrev_b32_e32 v20, 8, v55
-; SI-NEXT:    v_lshrrev_b32_e32 v60, 8, v38
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 8, v35
+; SI-NEXT:    v_mov_b32_e32 v20, v29
+; SI-NEXT:    v_lshrrev_b32_e32 v29, 8, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 8, v18
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 8, v9
+; SI-NEXT:    v_alignbit_b32 v26, v24, v60, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 8, v44
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 8, v58
+; SI-NEXT:    v_lshrrev_b32_e32 v49, 8, v47
+; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 8, v50
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 8, v62
+; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 8, v55
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 8, v35
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 8, v27
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v37, v34
+; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 8, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 8, v27
-; SI-NEXT:    v_mov_b32_e32 v15, v43
-; SI-NEXT:    v_mov_b32_e32 v43, v39
+; SI-NEXT:    v_mov_b32_e32 v29, v28
+; SI-NEXT:    v_mov_b32_e32 v23, v48
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 8, v63
+; SI-NEXT:    v_mov_b32_e32 v48, v33
+; SI-NEXT:    v_mov_b32_e32 v34, v53
+; SI-NEXT:    v_mov_b32_e32 v53, v42
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; SI-NEXT:    s_branch .LBB91_3
 ; SI-NEXT:  .LBB91_2:
-; SI-NEXT:    v_mov_b32_e32 v40, v29
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
@@ -170917,116 +170806,139 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr9
 ; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_mov_b32_e32 v43, v39
-; SI-NEXT:    v_mov_b32_e32 v32, v49
-; SI-NEXT:    v_mov_b32_e32 v48, v34
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v15, v25
+; SI-NEXT:    ; implicit-def: $vgpr40
+; SI-NEXT:    v_mov_b32_e32 v53, v42
+; SI-NEXT:    s_waitcnt expcnt(5)
+; SI-NEXT:    v_mov_b32_e32 v48, v33
+; SI-NEXT:    v_mov_b32_e32 v29, v28
+; SI-NEXT:    v_mov_b32_e32 v37, v34
+; SI-NEXT:    v_mov_b32_e32 v17, v63
 ; SI-NEXT:    s_mov_b64 s[4:5], -1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr9
 ; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    ; kill: killed $vgpr40
+; SI-NEXT:    ; implicit-def: $vgpr40
 ; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr52
 ; SI-NEXT:    ; implicit-def: $vgpr44
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    ; implicit-def: $vgpr19
+; SI-NEXT:    ; implicit-def: $vgpr14
+; SI-NEXT:    ; implicit-def: $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr58
-; SI-NEXT:    ; implicit-def: $vgpr34
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr47
-; SI-NEXT:    ; implicit-def: $vgpr8
+; SI-NEXT:    ; implicit-def: $vgpr11
 ; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr47
+; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; implicit-def: $vgpr25
+; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr50
-; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr53
+; SI-NEXT:    ; implicit-def: $vgpr39
+; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr23
 ; SI-NEXT:    ; implicit-def: $vgpr62
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr19
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr57
+; SI-NEXT:    ; implicit-def: $vgpr46
+; SI-NEXT:    ; implicit-def: $vgpr16
+; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr61
 ; SI-NEXT:    ; implicit-def: $vgpr38
-; SI-NEXT:    ; implicit-def: $vgpr60
-; SI-NEXT:    ; implicit-def: $vgpr24
 ; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr46
+; SI-NEXT:    ; implicit-def: $vgpr20
 ; SI-NEXT:    ; implicit-def: $vgpr35
+; SI-NEXT:    ; implicit-def: $vgpr15
 ; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr10
 ; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    ; implicit-def: $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr7
 ; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr37
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr8
+; SI-NEXT:    ; implicit-def: $vgpr60
+; SI-NEXT:    ; implicit-def: $vgpr26
+; SI-NEXT:    ; implicit-def: $vgpr24
+; SI-NEXT:    ; implicit-def: $vgpr57
 ; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr61
-; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr18
+; SI-NEXT:    ; implicit-def: $vgpr34
 ; SI-NEXT:    ; implicit-def: $vgpr63
-; SI-NEXT:    ; implicit-def: $vgpr56
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr52
 ; SI-NEXT:    ; implicit-def: $vgpr12
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr9
 ; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; kill: killed $vgpr17
-; SI-NEXT:    ; kill: killed $vgpr49
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; kill: killed $vgpr17
+; SI-NEXT:    ; kill: killed $vgpr40
+; SI-NEXT:    ; implicit-def: $vgpr40
+; SI-NEXT:    ; implicit-def: $vgpr56
+; SI-NEXT:    ; kill: killed $vgpr56
 ; SI-NEXT:  .LBB91_3: ; %Flow
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v56, v17
+; SI-NEXT:    v_mov_b32_e32 v54, v61
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v42, v32
+; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
 ; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; SI-NEXT:    s_cbranch_vccnz .LBB91_5
 ; SI-NEXT:  ; %bb.4: ; %cmp.true
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v42
-; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v54
-; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v8
+; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
+; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v17
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_and_b32_e32 v44, 0xffff0000, v44
-; SI-NEXT:    v_add_f32_e32 v46, 0x40c00000, v44
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_alignbit_b32 v56, v3, v2, 16
+; SI-NEXT:    v_alignbit_b32 v52, v3, v2, 16
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
@@ -171037,960 +170949,941 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v34, v4, v3, 16
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v33
 ; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_alignbit_b32 v4, v5, v4, 16
+; SI-NEXT:    v_alignbit_b32 v51, v5, v4, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v5
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v43
-; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v5
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v53
+; SI-NEXT:    v_add_f32_e32 v26, 0x40c00000, v5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_alignbit_b32 v5, v7, v5, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v8
+; SI-NEXT:    v_alignbit_b32 v57, v7, v5, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v26
 ; SI-NEXT:    v_alignbit_b32 v9, v7, v6, 16
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v32
 ; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 24, v8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v6
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v31
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v48
 ; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v11
-; SI-NEXT:    v_alignbit_b32 v12, v59, v7, 16
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 24, v11
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v11
+; SI-NEXT:    v_alignbit_b32 v12, v8, v7, 16
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v28
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT:    v_alignbit_b32 v6, v10, v6, 16
+; SI-NEXT:    v_alignbit_b32 v60, v10, v6, 16
 ; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v7
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v29
+; SI-NEXT:    v_add_f32_e32 v31, 0x40c00000, v7
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v7
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v39
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
 ; SI-NEXT:    v_alignbit_b32 v7, v13, v7, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v31
 ; SI-NEXT:    v_alignbit_b32 v63, v13, v10, 16
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v48
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v10
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v29
-; SI-NEXT:    v_add_f32_e32 v17, 0x40c00000, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 24, v14
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v37
+; SI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT:    v_alignbit_b32 v10, v16, v10, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v17
-; SI-NEXT:    v_alignbit_b32 v61, v16, v13, 16
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v40
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v13
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v15
-; SI-NEXT:    v_add_f32_e32 v20, 0x40c00000, v13
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v41
-; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 24, v17
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT:    v_alignbit_b32 v13, v19, v13, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v20
-; SI-NEXT:    v_alignbit_b32 v21, v19, v16, 16
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT:    v_alignbit_b32 v10, v14, v10, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v32
+; SI-NEXT:    v_alignbit_b32 v18, v14, v13, 16
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_f32_e32 v19, 0x40c00000, v16
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v51
-; SI-NEXT:    v_add_f32_e32 v31, 0x40c00000, v16
+; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v13
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT:    v_add_f32_e32 v49, 0x40c00000, v13
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v49
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v21, v15, v14, 16
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT:    v_alignbit_b32 v13, v16, v13, 16
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
+; SI-NEXT:    v_add_f32_e32 v23, 0x40c00000, v16
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v23
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT:    v_alignbit_b32 v16, v19, v16, 16
+; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v17
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v20, 0x40c00000, v19
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v31
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v37, v2, v19, 16
+; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v17
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT:    v_alignbit_b32 v24, v15, v14, 16
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v20
+; SI-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v29, 8, v24
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v17
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
 ; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
-; SI-NEXT:    v_alignbit_b32 v57, v22, v16, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 8, v37
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 8, v21
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 8, v61
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 8, v63
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 8, v12
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT:    v_alignbit_b32 v27, v15, v14, 16
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v3, v22, v19, 16
+; SI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v59
+; SI-NEXT:    v_add_f32_e32 v54, 0x40c00000, v22
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 8, v9
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v46
-; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v22, 0x40c00000, v19
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v39, 0x40c00000, v19
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v39
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v27, v2, v22, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v54
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT:    v_add_f32_e32 v59, 0x40c00000, v44
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v59
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v17
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT:    v_alignbit_b32 v30, v15, v14, 16
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v17
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
 ; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT:    v_alignbit_b32 v19, v25, v19, 16
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v25, 0x40c00000, v22
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v49, 0x40c00000, v22
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v28, 0xffff0000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v49
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v30, v2, v25, 16
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
-; SI-NEXT:    v_alignbit_b32 v53, v28, v22, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 8, v30
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v28, 0x40c00000, v25
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v51, 0x40c00000, v25
+; SI-NEXT:    v_alignbit_b32 v4, v25, v22, 16
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v17
+; SI-NEXT:    v_add_f32_e32 v40, 0x40c00000, v25
+; SI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v15
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v40
+; SI-NEXT:    v_alignbit_b32 v35, v45, v14, 16
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v32, 0xffff0000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v51
-; SI-NEXT:    v_alignbit_b32 v35, v2, v28, 16
-; SI-NEXT:    v_and_b32_e32 v28, 0xffff0000, v23
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
-; SI-NEXT:    v_alignbit_b32 v25, v32, v25, 16
-; SI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 8, v35
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v28, 0xffff0000, v23
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v52, 0x40c00000, v28
-; SI-NEXT:    v_and_b32_e32 v28, 0xffff0000, v18
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v24, 16, v52
-; SI-NEXT:    v_alignbit_b32 v38, v24, v32, 16
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v28, 0xffff0000, v15
 ; SI-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v60, 8, v38
+; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
+; SI-NEXT:    v_alignbit_b32 v5, v28, v25, 16
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v32, 0xffff0000, v23
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; SI-NEXT:    v_and_b32_e32 v28, 0xffff0000, v17
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v41, 0x40c00000, v28
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v41
+; SI-NEXT:    v_alignbit_b32 v38, v15, v14, 16
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v56
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v33, 0xffff0000, v18
+; SI-NEXT:    v_and_b32_e32 v28, 0xffff0000, v17
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v33, 0xffff0000, v17
 ; SI-NEXT:    v_add_f32_e32 v33, 0x40c00000, v33
 ; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
-; SI-NEXT:    v_alignbit_b32 v28, v33, v28, 16
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v33, 0xffff0000, v23
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v54, 0x40c00000, v33
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v54
-; SI-NEXT:    v_alignbit_b32 v55, v18, v32, 16
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v33, 0xffff0000, v23
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v33, 0x40c00000, v33
+; SI-NEXT:    v_alignbit_b32 v2, v33, v28, 16
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; SI-NEXT:    v_add_f32_e32 v43, 0x40c00000, v33
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v36, 0xffff0000, v23
+; SI-NEXT:    v_and_b32_e32 v33, 0xffff0000, v17
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v33, 0x40c00000, v33
+; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v43
+; SI-NEXT:    v_alignbit_b32 v55, v61, v14, 16
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v36, 0xffff0000, v17
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
 ; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
-; SI-NEXT:    v_alignbit_b32 v33, v36, v33, 16
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; SI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; SI-NEXT:    v_alignbit_b32 v6, v36, v33, 16
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v36, 0xffff0000, v36
-; SI-NEXT:    v_add_f32_e32 v41, 0x40c00000, v36
-; SI-NEXT:    v_and_b32_e32 v36, 0xffff0000, v23
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v41
-; SI-NEXT:    v_alignbit_b32 v62, v15, v32, 16
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v46, 0x40c00000, v36
+; SI-NEXT:    v_and_b32_e32 v36, 0xffff0000, v17
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v46
 ; SI-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 24, v41
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v62, v15, v14, 16
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v39, 0xffff0000, v17
+; SI-NEXT:    v_add_f32_e32 v39, 0x40c00000, v39
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v39
+; SI-NEXT:    v_alignbit_b32 v36, v39, v36, 16
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v39, 0xffff0000, v39
+; SI-NEXT:    v_add_f32_e32 v42, 0x40c00000, v39
+; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v42
+; SI-NEXT:    v_alignbit_b32 v50, v17, v14, 16
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v39, 0xffff0000, v39
+; SI-NEXT:    v_add_f32_e32 v56, 0x40c00000, v39
+; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v56
+; SI-NEXT:    v_alignbit_b32 v47, v25, v14, 16
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v39, 0xffff0000, v39
+; SI-NEXT:    v_add_f32_e32 v39, 0x40c00000, v39
+; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v39
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 24, v54
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v39
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 8, v50
+; SI-NEXT:    v_alignbit_b32 v58, v22, v14, 16
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 24, v52
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v56
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 24, v51
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v42
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 8, v63
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 24, v49
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v46
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 8, v55
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 24, v39
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v43
+; SI-NEXT:    v_alignbit_b32 v43, v38, v16, 8
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 24, v31
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v41
+; SI-NEXT:    v_alignbit_b32 v41, v38, v16, 16
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 24, v20
-; SI-NEXT:    v_mov_b32_e32 v49, v59
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 8, v62
-; SI-NEXT:    v_lshrrev_b32_e32 v20, 8, v55
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v40
+; SI-NEXT:    v_mov_b32_e32 v40, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 24, v54
+; SI-NEXT:    v_alignbit_b32 v54, v38, v16, 24
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 8, v27
-; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_and_b32_e32 v48, 0xffff0000, v23
-; SI-NEXT:    v_add_f32_e32 v48, 0x40c00000, v48
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v48
-; SI-NEXT:    v_alignbit_b32 v36, v48, v36, 16
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; SI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v48, 0xffff0000, v48
-; SI-NEXT:    v_add_f32_e32 v40, 0x40c00000, v48
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v40
-; SI-NEXT:    v_alignbit_b32 v50, v15, v32, 16
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 24, v20
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v20, v35, v13, 8
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v46
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v46, v35, v13, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 8, v50
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v48, 0xffff0000, v48
-; SI-NEXT:    v_add_f32_e32 v42, 0x40c00000, v48
-; SI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v42
-; SI-NEXT:    v_alignbit_b32 v47, v22, v32, 16
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v48, 0xffff0000, v48
-; SI-NEXT:    v_add_f32_e32 v43, 0x40c00000, v48
-; SI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v43
-; SI-NEXT:    v_alignbit_b32 v58, v23, v32, 16
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 8, v35
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v43
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 24, v23
+; SI-NEXT:    v_alignbit_b32 v23, v62, v4, 8
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v42
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 24, v49
+; SI-NEXT:    v_lshrrev_b32_e32 v49, 8, v47
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 24, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 8, v18
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 24, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 8, v62
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 24, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 8, v58
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 24, v26
+; SI-NEXT:    v_alignbit_b32 v26, v24, v60, 16
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT:    v_alignbit_b32 v44, v19, v14, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 24, v59
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v40
-; SI-NEXT:    v_lshrrev_b32_e32 v34, 8, v58
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; SI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT:    v_alignbit_b32 v44, v16, v32, 16
 ; SI-NEXT:    v_alignbit_b32 v8, v44, v36, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v8, v44, v36, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v58, v33, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v58, v33, 16
+; SI-NEXT:    v_alignbit_b32 v8, v44, v36, 8
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v58, v33, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v58, v6, 24
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v47, v28, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v58, v6, 16
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v47, v28, 16
+; SI-NEXT:    v_alignbit_b32 v8, v58, v6, 8
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v47, v28, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v47, v2, 24
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v50, v25, 24
+; SI-NEXT:    v_alignbit_b32 v8, v47, v2, 16
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v50, v25, 16
+; SI-NEXT:    v_alignbit_b32 v8, v47, v2, 8
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v50, v25, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v50, v5, 24
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v62, v53, 24
+; SI-NEXT:    v_alignbit_b32 v8, v50, v5, 16
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v62, v53, 16
+; SI-NEXT:    v_alignbit_b32 v8, v50, v5, 8
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v62, v53, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v62, v4, 24
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v55, v19, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v62, v4, 16
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v55, v19, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v55, v3, 24
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v55, v19, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v55, v3, 16
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v38, v57, 24
+; SI-NEXT:    v_alignbit_b32 v8, v55, v3, 8
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v38, v57, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v38, v57, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v8, v35, v13, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v8, v35, v13, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v8, v30, v10, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v8, v30, v10, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v8, v30, v10, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v8, v27, v7, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v8, v27, v7, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v8, v27, v7, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v37, v6, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v24, v60, 24
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v37, v6, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v24, v60, 8
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v37, v6, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v21, v57, 24
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v21, v5, 24
+; SI-NEXT:    v_alignbit_b32 v8, v21, v57, 16
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v21, v5, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v21, v57, 8
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v21, v5, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v18, v51, 24
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v61, v4, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v18, v51, 16
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_alignbit_b32 v8, v18, v51, 8
+; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 8, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 8, v44
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v61, v4, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v29, 8, v21
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v61, v4, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v63, v34, 24
+; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 8, v9
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v63, v3, 24
+; SI-NEXT:    v_alignbit_b32 v8, v63, v34, 16
+; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v63, v3, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v63, v34, 8
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v63, v3, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v12, v52, 24
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v12, v56, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, v12, v52, 16
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v12, v56, 16
+; SI-NEXT:    v_alignbit_b32 v8, v12, v52, 8
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v12, v56, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v8, v9, v1, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v8, v9, v1, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v8, v9, v1, 8
-; SI-NEXT:    v_alignbit_b32 v52, v44, v36, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 8, v44
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 8, v47
-; SI-NEXT:  .LBB91_5: ; %end
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 8, v38
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 8, v30
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 8, v27
+; SI-NEXT:  .LBB91_5: ; %end
+; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v36, 0xff, v36
-; SI-NEXT:    v_lshlrev_b32_e32 v52, 8, v52
-; SI-NEXT:    v_or_b32_e32 v36, v36, v52
-; SI-NEXT:    v_and_b32_e32 v36, 0xffff, v36
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v39, 8, v26
-; SI-NEXT:    v_and_b32_e32 v33, 0xff, v33
-; SI-NEXT:    v_lshlrev_b32_e32 v31, 8, v34
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v14
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v10
 ; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v52, 0xff, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v51, 24, v17
-; SI-NEXT:    v_or_b32_e32 v51, v51, v52
-; SI-NEXT:    v_or_b32_e32 v36, v36, v51
-; SI-NEXT:    buffer_store_dword v36, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v36, 0xff, v44
-; SI-NEXT:    v_or_b32_e32 v36, v36, v39
-; SI-NEXT:    v_and_b32_e32 v39, 0xff, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
-; SI-NEXT:    v_and_b32_e32 v36, 0xffff, v36
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 8, v28
+; SI-NEXT:    v_or_b32_e32 v32, v36, v32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v51, 24, v15
-; SI-NEXT:    v_or_b32_e32 v39, v51, v39
-; SI-NEXT:    v_or_b32_e32 v36, v36, v39
-; SI-NEXT:    v_add_i32_e32 v39, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v36, v39, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
-; SI-NEXT:    v_or_b32_e32 v17, v33, v17
+; SI-NEXT:    v_and_b32_e32 v36, 0xff, v29
+; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
+; SI-NEXT:    v_and_b32_e32 v32, 0xffff, v32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v33, 0xff, v26
-; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v33
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v36, 24, v26
-; SI-NEXT:    v_or_b32_e32 v33, v36, v33
-; SI-NEXT:    v_or_b32_e32 v17, v17, v33
-; SI-NEXT:    v_add_i32_e32 v33, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v17, v33, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v56, 24, v29
+; SI-NEXT:    v_or_b32_e32 v36, v56, v36
+; SI-NEXT:    v_or_b32_e32 v32, v32, v36
+; SI-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v17, 0xff, v58
-; SI-NEXT:    v_or_b32_e32 v17, v17, v31
-; SI-NEXT:    v_and_b32_e32 v31, 0xff, v23
-; SI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 24, v15
-; SI-NEXT:    v_or_b32_e32 v31, v32, v31
-; SI-NEXT:    v_or_b32_e32 v17, v17, v31
-; SI-NEXT:    v_add_i32_e32 v31, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v17, v31, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v32, 0xff, v44
+; SI-NEXT:    v_lshlrev_b32_e32 v36, 8, v51
+; SI-NEXT:    v_or_b32_e32 v32, v32, v36
+; SI-NEXT:    v_and_b32_e32 v36, 0xff, v19
+; SI-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
+; SI-NEXT:    v_or_b32_e32 v14, v14, v36
+; SI-NEXT:    v_and_b32_e32 v32, 0xffff, v32
+; SI-NEXT:    v_or_b32_e32 v14, v32, v14
+; SI-NEXT:    v_add_i32_e32 v32, vcc, 4, v0
+; SI-NEXT:    buffer_store_dword v14, v32, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v17, 0xff, v28
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v6
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 8, v23
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v17, v17, v28
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 8, v19
+; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v14, v14, v32
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v14
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v28, 0xff, v23
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; SI-NEXT:    v_and_b32_e32 v32, 0xff, v19
+; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v31, 24, v23
-; SI-NEXT:    v_or_b32_e32 v28, v31, v28
-; SI-NEXT:    v_or_b32_e32 v17, v17, v28
-; SI-NEXT:    v_add_i32_e32 v28, vcc, 16, v0
-; SI-NEXT:    buffer_store_dword v17, v28, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v17, 0xff, v47
-; SI-NEXT:    v_or_b32_e32 v8, v17, v8
-; SI-NEXT:    v_and_b32_e32 v17, 0xff, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v33, 24, v19
+; SI-NEXT:    v_or_b32_e32 v32, v33, v32
+; SI-NEXT:    v_or_b32_e32 v14, v14, v32
+; SI-NEXT:    v_add_i32_e32 v32, vcc, 8, v0
+; SI-NEXT:    buffer_store_dword v14, v32, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v58
+; SI-NEXT:    v_or_b32_e32 v11, v14, v11
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 24, v15
-; SI-NEXT:    v_or_b32_e32 v17, v28, v17
-; SI-NEXT:    v_or_b32_e32 v8, v8, v17
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v8, v17, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 24, v6
+; SI-NEXT:    v_or_b32_e32 v14, v32, v14
+; SI-NEXT:    v_or_b32_e32 v11, v11, v14
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 12, v0
+; SI-NEXT:    buffer_store_dword v11, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v25
-; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v2
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v60
+; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
-; SI-NEXT:    v_or_b32_e32 v8, v8, v17
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; SI-NEXT:    v_or_b32_e32 v11, v11, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 24, v25
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 24, v19
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT:    v_or_b32_e32 v17, v25, v17
-; SI-NEXT:    v_or_b32_e32 v8, v8, v17
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v8, v17, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_or_b32_e32 v14, v28, v14
+; SI-NEXT:    v_or_b32_e32 v11, v11, v14
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v11, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v50
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 8, v59
-; SI-NEXT:    v_or_b32_e32 v8, v8, v17
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v17, 0xff, v15
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v47
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 8, v49
+; SI-NEXT:    v_or_b32_e32 v11, v11, v14
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v25
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 24, v15
-; SI-NEXT:    v_or_b32_e32 v17, v25, v17
-; SI-NEXT:    v_or_b32_e32 v8, v8, v17
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 28, v0
-; SI-NEXT:    buffer_store_dword v8, v17, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 24, v2
+; SI-NEXT:    v_or_b32_e32 v14, v28, v14
+; SI-NEXT:    v_or_b32_e32 v11, v11, v14
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 20, v0
+; SI-NEXT:    buffer_store_dword v11, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v53
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v5
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v57
+; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
-; SI-NEXT:    v_or_b32_e32 v8, v8, v17
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; SI-NEXT:    v_or_b32_e32 v11, v11, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 24, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 24, v19
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT:    v_or_b32_e32 v17, v22, v17
-; SI-NEXT:    v_or_b32_e32 v8, v8, v17
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v8, v17, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_or_b32_e32 v14, v25, v14
+; SI-NEXT:    v_or_b32_e32 v11, v11, v14
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 24, v0
+; SI-NEXT:    buffer_store_dword v11, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v62
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v50
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 8, v39
+; SI-NEXT:    v_or_b32_e32 v11, v11, v14
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v17
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 24, v17
-; SI-NEXT:    v_or_b32_e32 v11, v17, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 36, v0
-; SI-NEXT:    buffer_store_dword v8, v11, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 24, v2
+; SI-NEXT:    v_or_b32_e32 v14, v25, v14
+; SI-NEXT:    v_or_b32_e32 v11, v11, v14
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 28, v0
+; SI-NEXT:    buffer_store_dword v11, v14, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v19
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 8, v23
+; SI-NEXT:    v_or_b32_e32 v11, v11, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 24, v17
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_or_b32_e32 v11, v17, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v8, v11, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v55
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v20
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 24, v17
+; SI-NEXT:    v_or_b32_e32 v14, v22, v14
+; SI-NEXT:    v_or_b32_e32 v11, v11, v14
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 32, v0
+; SI-NEXT:    buffer_store_dword v11, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v62
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 8, v31
+; SI-NEXT:    v_or_b32_e32 v11, v11, v14
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 24, v17
-; SI-NEXT:    v_or_b32_e32 v11, v17, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 44, v0
-; SI-NEXT:    buffer_store_dword v8, v11, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 24, v17
+; SI-NEXT:    v_or_b32_e32 v14, v22, v14
+; SI-NEXT:    v_or_b32_e32 v11, v11, v14
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 36, v0
+; SI-NEXT:    buffer_store_dword v11, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v57
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v3
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v34
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; SI-NEXT:    v_or_b32_e32 v11, v11, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 24, v17
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_or_b32_e32 v14, v19, v14
+; SI-NEXT:    v_or_b32_e32 v11, v11, v14
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 40, v0
+; SI-NEXT:    buffer_store_dword v11, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v55
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 8, v46
+; SI-NEXT:    v_or_b32_e32 v11, v11, v14
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v61
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 24, v17
+; SI-NEXT:    v_or_b32_e32 v14, v19, v14
+; SI-NEXT:    v_or_b32_e32 v11, v11, v14
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 44, v0
+; SI-NEXT:    buffer_store_dword v11, v14, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 8, v43
+; SI-NEXT:    v_or_b32_e32 v11, v11, v14
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v41
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 24, v54
+; SI-NEXT:    v_or_b32_e32 v14, v16, v14
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT:    v_or_b32_e32 v11, v11, v14
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 48, v0
+; SI-NEXT:    buffer_store_dword v11, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v38
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 8, v2
+; SI-NEXT:    v_or_b32_e32 v11, v11, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v16, 24, v16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_or_b32_e32 v11, v16, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 48, v0
-; SI-NEXT:    buffer_store_dword v8, v11, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_or_b32_e32 v14, v16, v14
+; SI-NEXT:    v_or_b32_e32 v11, v11, v14
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 52, v0
+; SI-NEXT:    buffer_store_dword v11, v14, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v13
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 8, v20
+; SI-NEXT:    v_or_b32_e32 v11, v11, v13
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v14
+; SI-NEXT:    v_or_b32_e32 v13, v14, v13
+; SI-NEXT:    v_or_b32_e32 v11, v11, v13
+; SI-NEXT:    v_add_i32_e32 v13, vcc, 56, v0
+; SI-NEXT:    buffer_store_dword v11, v13, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v38
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v60
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v35
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 8, v15
+; SI-NEXT:    v_or_b32_e32 v11, v11, v13
+; SI-NEXT:    v_and_b32_e32 v13, 0xff, v45
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 24, v16
-; SI-NEXT:    v_or_b32_e32 v11, v16, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 52, v0
-; SI-NEXT:    buffer_store_dword v8, v11, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v46
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v14
+; SI-NEXT:    v_or_b32_e32 v13, v14, v13
+; SI-NEXT:    v_or_b32_e32 v11, v11, v13
+; SI-NEXT:    v_add_i32_e32 v13, vcc, 60, v0
+; SI-NEXT:    buffer_store_dword v11, v13, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
+; SI-NEXT:    v_or_b32_e32 v10, v10, v11
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
 ; SI-NEXT:    v_or_b32_e32 v11, v13, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 56, v0
-; SI-NEXT:    buffer_store_dword v8, v11, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v10, v10, v11
+; SI-NEXT:    v_add_i32_e32 v11, vcc, 64, v0
+; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v35
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v45
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v30
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v2
+; SI-NEXT:    v_or_b32_e32 v10, v10, v11
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 24, v2
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; SI-NEXT:    v_or_b32_e32 v11, v13, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 60, v0
-; SI-NEXT:    buffer_store_dword v8, v11, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v10
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 8, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v8, v8, v10
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v2
-; SI-NEXT:    v_or_b32_e32 v10, v11, v10
-; SI-NEXT:    v_or_b32_e32 v8, v8, v10
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 64, v0
-; SI-NEXT:    buffer_store_dword v8, v10, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v10, v10, v11
+; SI-NEXT:    v_add_i32_e32 v11, vcc, 0x44, v0
+; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v30
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 8, v14
-; SI-NEXT:    v_or_b32_e32 v8, v8, v10
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
+; SI-NEXT:    v_or_b32_e32 v7, v7, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v10
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v2
 ; SI-NEXT:    v_or_b32_e32 v10, v11, v10
-; SI-NEXT:    v_or_b32_e32 v8, v8, v10
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 0x44, v0
-; SI-NEXT:    buffer_store_dword v8, v10, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 8, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v2
-; SI-NEXT:    v_or_b32_e32 v8, v10, v8
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x48, v0
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v7, v7, v10
+; SI-NEXT:    v_add_i32_e32 v10, vcc, 0x48, v0
+; SI-NEXT:    buffer_store_dword v7, v10, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v7, 0xff, v27
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 8, v18
 ; SI-NEXT:    v_or_b32_e32 v7, v7, v8
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
 ; SI-NEXT:    v_or_b32_e32 v8, v10, v8
 ; SI-NEXT:    v_or_b32_e32 v7, v7, v8
 ; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x4c, v0
 ; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
 ; SI-NEXT:    v_or_b32_e32 v6, v6, v7
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v7, 0xff, v26
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 24, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; SI-NEXT:    v_or_b32_e32 v7, v8, v7
 ; SI-NEXT:    v_or_b32_e32 v6, v6, v7
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x50, v0
 ; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v37
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v24
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
 ; SI-NEXT:    v_or_b32_e32 v6, v6, v7
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 24, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 24, v2
 ; SI-NEXT:    v_or_b32_e32 v7, v8, v7
 ; SI-NEXT:    v_or_b32_e32 v6, v6, v7
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x54, v0
 ; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v6
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
 ; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v2
 ; SI-NEXT:    v_or_b32_e32 v6, v7, v6
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v6
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x58, v0
 ; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v5, 0xff, v21
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v2
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v2
 ; SI-NEXT:    v_or_b32_e32 v6, v7, v6
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v6
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x5c, v0
 ; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v2
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v52
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v2
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v5
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x60, v0
 ; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v61
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v2
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v18
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v2
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v5
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x64, v0
 ; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v2
 ; SI-NEXT:    v_or_b32_e32 v4, v5, v4
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v4
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x68, v0
 ; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xff, v63
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v42
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v4
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; SI-NEXT:    v_or_b32_e32 v4, v5, v4
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v4
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x6c, v0
 ; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v56
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -172001,14 +171894,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x70, v0
 ; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v12
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v49
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v40
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
@@ -172018,12 +171911,12 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x74, v0
 ; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -172034,10 +171927,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v9
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
@@ -172075,8 +171968,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
 ; VI-NEXT:    v_writelane_b32 v63, s30, 0
 ; VI-NEXT:    v_writelane_b32 v63, s31, 1
@@ -172881,26 +172774,26 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
 ; VI-NEXT:    v_alignbit_b32 v31, v31, v33, 16
 ; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[29:30]
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[27:28]
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[27:28]
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[25:26]
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[21:22]
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[19:20]
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[17:18]
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[15:16]
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
@@ -172911,129 +172804,126 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[9:10]
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[7:8]
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[5:6]
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[3:4]
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[1:2]
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v32
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v32
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v32
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v32
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v31
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v31
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v30
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v30
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v30
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v30
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v29
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v29
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v28
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v28
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v28
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v28
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v27
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v27
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v26
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v26
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v26
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v26
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v25
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v25
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v24
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v24
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v24
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v24
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v23
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v23
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v22
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v22
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v16
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v22
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v15
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v21
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 24, v4
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v21
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v20
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v3
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v20
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v3
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v20
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 24, v2
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v19
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v2
-; VI-NEXT:    v_lshrrev_b64 v[41:42], 24, v[23:24]
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v19
-; VI-NEXT:    v_lshrrev_b32_e32 v51, 24, v12
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v2
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v18
+; VI-NEXT:    v_lshrrev_b32_e32 v34, 24, v2
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v18
+; VI-NEXT:    v_lshrrev_b32_e32 v35, 16, v16
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v2
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v18
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 24, v12
+; VI-NEXT:    v_lshrrev_b32_e32 v35, 24, v8
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v2
+; VI-NEXT:    v_lshrrev_b64 v[41:42], 24, v[23:24]
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v17
+; VI-NEXT:    v_lshrrev_b32_e32 v45, 24, v16
 ; VI-NEXT:    v_lshrrev_b32_e32 v55, 8, v16
 ; VI-NEXT:    v_lshrrev_b32_e32 v56, 8, v13
 ; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v51, 8, v12
 ; VI-NEXT:    v_lshrrev_b32_e32 v57, 16, v9
-; VI-NEXT:    v_lshrrev_b32_e32 v60, 8, v7
-; VI-NEXT:    v_lshrrev_b32_e32 v42, 8, v5
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v35, 16, v8
+; VI-NEXT:    v_lshrrev_b32_e32 v58, 8, v7
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v1
-; VI-NEXT:    v_lshrrev_b32_e32 v49, 24, v18
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v52, 8, v18
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v17
-; VI-NEXT:    v_lshrrev_b32_e32 v36, 8, v17
-; VI-NEXT:    v_lshrrev_b32_e32 v45, 24, v16
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v17
+; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v55, 16, v15
 ; VI-NEXT:    v_lshrrev_b32_e32 v50, 8, v15
 ; VI-NEXT:    v_lshrrev_b32_e32 v43, 24, v14
-; VI-NEXT:    v_lshrrev_b32_e32 v55, 16, v14
+; VI-NEXT:    v_lshrrev_b32_e32 v46, 16, v14
 ; VI-NEXT:    v_lshrrev_b32_e32 v48, 8, v14
 ; VI-NEXT:    v_lshrrev_b32_e32 v47, 16, v13
 ; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
@@ -173046,21 +172936,23 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT:    v_lshrrev_b32_e32 v40, 8, v10
 ; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v57, 8, v9
-; VI-NEXT:    v_lshrrev_b32_e32 v37, 24, v8
-; VI-NEXT:    v_lshrrev_b32_e32 v35, 16, v8
-; VI-NEXT:    v_lshrrev_b32_e32 v58, 8, v8
-; VI-NEXT:    v_lshrrev_b32_e32 v59, 16, v7
-; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v38, 24, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v60, 16, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v61, 8, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v39, 16, v5
-; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v42, 16, v4
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v35, 8, v8
+; VI-NEXT:    v_lshrrev_b32_e32 v37, 16, v7
+; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v59, 24, v6
+; VI-NEXT:    v_lshrrev_b32_e32 v58, 16, v6
+; VI-NEXT:    v_lshrrev_b32_e32 v60, 8, v6
+; VI-NEXT:    v_lshrrev_b32_e32 v38, 16, v5
+; VI-NEXT:    v_lshrrev_b32_e32 v45, 8, v5
+; VI-NEXT:    v_lshrrev_b32_e32 v42, 24, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v39, 16, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v61, 8, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v52, 16, v3
+; VI-NEXT:    v_lshrrev_b32_e32 v49, 8, v3
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v1
-; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; VI-NEXT:    s_branch .LBB91_5
 ; VI-NEXT:  .LBB91_3:
 ; VI-NEXT:    ; implicit-def: $sgpr46
@@ -173220,29 +173112,23 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT:    s_branch .LBB91_2
 ; VI-NEXT:  .LBB91_4:
 ; VI-NEXT:    v_mov_b32_e32 v33, s71
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v33, s69
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v33, s70
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v33, s70
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v33, s68
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v33, s67
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v33, s66
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v33, s64
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v33, s65
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v33, s54
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v33, s53
-; VI-NEXT:    v_mov_b32_e32 v31, s4
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v33, s86
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v33, s83
+; VI-NEXT:    v_mov_b32_e32 v31, s4
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v33, s82
 ; VI-NEXT:    v_readlane_b32 s4, v62, 0
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v33, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 1
 ; VI-NEXT:    v_mov_b32_e32 v40, s4
@@ -173274,170 +173160,171 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT:    v_readlane_b32 s4, v62, 13
 ; VI-NEXT:    v_mov_b32_e32 v46, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 14
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v33, s4
+; VI-NEXT:    v_mov_b32_e32 v50, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 15
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v33, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 16
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v33, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 17
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v33, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 18
-; VI-NEXT:    v_mov_b32_e32 v36, s4
-; VI-NEXT:    v_readlane_b32 s4, v62, 19
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v33, s4
+; VI-NEXT:    v_readlane_b32 s4, v62, 19
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 20
-; VI-NEXT:    v_mov_b32_e32 v52, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 21
-; VI-NEXT:    v_mov_b32_e32 v49, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 22
-; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v49, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 23
-; VI-NEXT:    v_mov_b32_e32 v50, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 24
-; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v50, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 25
-; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v50, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 26
-; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v50, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 27
-; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v50, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 28
-; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v50, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 29
-; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v50, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 30
-; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v50, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 31
-; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v50, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 32
-; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v50, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 33
-; VI-NEXT:    v_mov_b32_e32 v41, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 34
-; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v50, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 35
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v41, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 36
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v41, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 37
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v41, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 38
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v41, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 39
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v41, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 40
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v41, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 41
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v41, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 42
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v41, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 43
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v41, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 44
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v41, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 45
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v41, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 46
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v41, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 47
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v41, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 48
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v41, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 49
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v41, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 50
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v41, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 51
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v41, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 52
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v41, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 53
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v41, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 54
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v41, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 55
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v41, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 56
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v41, s4
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 57
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v41, s4
-; VI-NEXT:    v_mov_b32_e32 v42, s51
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v42, s54
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v55, s4
 ; VI-NEXT:    v_mov_b32_e32 v41, s46
-; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v41, s56
+; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v41, s58
 ; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v41, s58
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v41, s60
 ; VI-NEXT:    v_mov_b32_e32 v45, s72
 ; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v45, s74
-; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v45, s76
-; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v45, s78
 ; VI-NEXT:    v_mov_b32_e32 v55, s88
 ; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v61, s50
-; VI-NEXT:    v_mov_b32_e32 v58, s83
-; VI-NEXT:    v_mov_b32_e32 v55, v48
-; VI-NEXT:    v_mov_b32_e32 v48, v47
-; VI-NEXT:    v_mov_b32_e32 v57, s30
-; VI-NEXT:    v_mov_b32_e32 v35, s83
-; VI-NEXT:    v_mov_b32_e32 v60, s34
-; VI-NEXT:    v_mov_b32_e32 v42, s36
+; VI-NEXT:    v_mov_b32_e32 v36, s66
+; VI-NEXT:    v_mov_b32_e32 v52, s64
+; VI-NEXT:    v_mov_b32_e32 v55, v50
+; VI-NEXT:    v_mov_b32_e32 v35, s30
+; VI-NEXT:    v_mov_b32_e32 v59, s87
+; VI-NEXT:    v_mov_b32_e32 v58, s34
+; VI-NEXT:    v_mov_b32_e32 v45, s36
 ; VI-NEXT:    v_mov_b32_e32 v34, s38
 ; VI-NEXT:    v_mov_b32_e32 v1, s44
 ; VI-NEXT:    v_mov_b32_e32 v2, s45
@@ -173470,44 +173357,44 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT:    v_mov_b32_e32 v29, s28
 ; VI-NEXT:    v_mov_b32_e32 v30, s29
 ; VI-NEXT:    v_mov_b32_e32 v32, s5
-; VI-NEXT:    v_mov_b32_e32 v38, s87
-; VI-NEXT:    v_mov_b32_e32 v37, s82
 ; VI-NEXT:    v_mov_b32_e32 v41, s62
-; VI-NEXT:    v_mov_b32_e32 v59, s84
-; VI-NEXT:    v_mov_b32_e32 v39, s51
+; VI-NEXT:    v_mov_b32_e32 v57, s81
+; VI-NEXT:    v_mov_b32_e32 v37, s84
+; VI-NEXT:    v_mov_b32_e32 v60, s52
+; VI-NEXT:    v_mov_b32_e32 v38, s51
+; VI-NEXT:    v_mov_b32_e32 v61, s65
+; VI-NEXT:    v_mov_b32_e32 v49, s66
+; VI-NEXT:    v_mov_b32_e32 v39, s55
 ; VI-NEXT:    v_mov_b32_e32 v50, v46
+; VI-NEXT:    v_mov_b32_e32 v46, v48
+; VI-NEXT:    v_mov_b32_e32 v48, v47
 ; VI-NEXT:    v_mov_b32_e32 v47, v56
 ; VI-NEXT:    v_mov_b32_e32 v56, v51
 ; VI-NEXT:    v_mov_b32_e32 v51, s90
 ; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v35, s85
+; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v34, s48
 ; VI-NEXT:    v_mov_b32_e32 v51, v53
 ; VI-NEXT:    v_mov_b32_e32 v53, v54
 ; VI-NEXT:    v_mov_b32_e32 v54, v40
 ; VI-NEXT:    v_mov_b32_e32 v40, s80
-; VI-NEXT:    v_mov_b32_e32 v57, s81
-; VI-NEXT:    v_mov_b32_e32 v58, s85
-; VI-NEXT:    v_mov_b32_e32 v60, s50
-; VI-NEXT:    v_mov_b32_e32 v61, s52
-; VI-NEXT:    v_mov_b32_e32 v42, s55
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v58, s50
+; VI-NEXT:    v_mov_b32_e32 v45, s53
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; VI-NEXT:  .LBB91_5: ; %end
-; VI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b32_e32 v36, 8, v36
+; VI-NEXT:    v_lshlrev_b32_e32 v36, 8, v33
+; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
 ; VI-NEXT:    v_or_b32_sdwa v17, v17, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b32_e32 v36, 8, v52
-; VI-NEXT:    v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_readlane_b32 s87, v63, 31
 ; VI-NEXT:    v_readlane_b32 s86, v63, 30
 ; VI-NEXT:    v_readlane_b32 s85, v63, 29
@@ -173540,35 +173427,44 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT:    v_readlane_b32 s34, v63, 2
 ; VI-NEXT:    v_readlane_b32 s31, v63, 1
 ; VI-NEXT:    v_readlane_b32 s30, v63, 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshlrev_b32_e32 v36, 8, v33
+; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b32_e32 v36, 8, v45
+; VI-NEXT:    v_lshlrev_b32_e32 v36, 8, v33
+; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v34, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v17, v17, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b32_e32 v17, 8, v49
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v17, v33, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v18, vcc, 4, v0
 ; VI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v18, 8, v18
 ; VI-NEXT:    v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v18, vcc, 8, v0
 ; VI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -173579,23 +173475,23 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT:    v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v18, vcc, 12, v0
 ; VI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v18, 8, v18
 ; VI-NEXT:    v_or_b32_sdwa v18, v21, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v18, vcc, 16, v0
 ; VI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -173606,20 +173502,20 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT:    v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v18, vcc, 20, v0
 ; VI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b32_e32 v17, 8, v41
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v18, 8, v18
 ; VI-NEXT:    v_or_b32_sdwa v18, v23, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v18, vcc, 24, v0
 ; VI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -173632,21 +173528,21 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
 ; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v18, 8, v18
 ; VI-NEXT:    v_or_b32_sdwa v18, v25, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v18, vcc, 32, v0
 ; VI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -173657,23 +173553,23 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT:    v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v18, vcc, 36, v0
 ; VI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v18, 8, v18
 ; VI-NEXT:    v_or_b32_sdwa v18, v27, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v18, vcc, 40, v0
 ; VI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -173684,23 +173580,23 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT:    v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v18, vcc, 44, v0
 ; VI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v18, 8, v18
 ; VI-NEXT:    v_or_b32_sdwa v18, v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v18, vcc, 48, v0
 ; VI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -173711,23 +173607,23 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT:    v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v18, vcc, 52, v0
 ; VI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v18, 8, v18
 ; VI-NEXT:    v_or_b32_sdwa v18, v31, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v18, vcc, 56, v0
 ; VI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
 ; VI-NEXT:    s_waitcnt vmcnt(1)
@@ -173738,95 +173634,90 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT:    v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v18, vcc, 60, v0
 ; VI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v18, 8, v18
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v17, vcc, 64, v0
 ; VI-NEXT:    buffer_store_dword v1, v17, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x44, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v49
 ; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; VI-NEXT:    v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x48, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v61
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v42
 ; VI-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x4c, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v45
+; VI-NEXT:    v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x50, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v61
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v38
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v60
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v59
 ; VI-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x54, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x58, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v58
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v37
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v35
 ; VI-NEXT:    v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x5c, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v57
 ; VI-NEXT:    v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -173880,24 +173771,22 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v48
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v43
 ; VI-NEXT:    v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x74, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v50
 ; VI-NEXT:    v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x78, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x7c, v0
 ; VI-NEXT:    s_waitcnt vmcnt(2)
@@ -173923,8 +173812,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; VI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
@@ -173987,8 +173876,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    v_readfirstlane_b32 s59, v14
 ; GFX9-NEXT:    v_readfirstlane_b32 s56, v15
 ; GFX9-NEXT:    v_readfirstlane_b32 s57, v16
-; GFX9-NEXT:    v_readfirstlane_b32 s44, v17
-; GFX9-NEXT:    v_readfirstlane_b32 s45, v18
+; GFX9-NEXT:    v_readfirstlane_b32 s46, v17
+; GFX9-NEXT:    v_readfirstlane_b32 s47, v18
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
@@ -174103,51 +173992,51 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    v_writelane_b32 v62, s41, 3
 ; GFX9-NEXT:    s_lshr_b64 s[40:41], s[22:23], 24
 ; GFX9-NEXT:    v_writelane_b32 v62, s40, 0
-; GFX9-NEXT:    s_lshr_b32 s53, s45, 24
-; GFX9-NEXT:    s_lshr_b32 s15, s45, 16
-; GFX9-NEXT:    s_lshr_b32 s70, s45, 8
-; GFX9-NEXT:    s_lshr_b32 s7, s44, 16
-; GFX9-NEXT:    s_lshr_b32 s6, s44, 8
-; GFX9-NEXT:    s_lshr_b32 s65, s57, 24
+; GFX9-NEXT:    s_lshr_b32 s70, s47, 24
+; GFX9-NEXT:    s_lshr_b32 s15, s47, 16
+; GFX9-NEXT:    s_lshr_b32 s7, s47, 8
+; GFX9-NEXT:    s_lshr_b32 s53, s46, 16
+; GFX9-NEXT:    s_lshr_b32 s52, s46, 8
+; GFX9-NEXT:    s_lshr_b32 s67, s57, 24
 ; GFX9-NEXT:    s_lshr_b32 s14, s57, 16
-; GFX9-NEXT:    s_lshr_b32 s68, s57, 8
-; GFX9-NEXT:    s_lshr_b32 s69, s56, 16
-; GFX9-NEXT:    s_lshr_b32 s38, s56, 8
-; GFX9-NEXT:    s_lshr_b32 s54, s59, 24
+; GFX9-NEXT:    s_lshr_b32 s69, s57, 8
+; GFX9-NEXT:    s_lshr_b32 s6, s56, 16
+; GFX9-NEXT:    s_lshr_b32 s71, s56, 8
+; GFX9-NEXT:    s_lshr_b32 s64, s59, 24
 ; GFX9-NEXT:    s_lshr_b32 s13, s59, 16
-; GFX9-NEXT:    s_lshr_b32 s52, s59, 8
-; GFX9-NEXT:    s_lshr_b32 s67, s58, 16
-; GFX9-NEXT:    s_lshr_b32 s66, s58, 8
-; GFX9-NEXT:    s_lshr_b32 s97, s61, 24
+; GFX9-NEXT:    s_lshr_b32 s66, s59, 8
+; GFX9-NEXT:    s_lshr_b32 s51, s58, 16
+; GFX9-NEXT:    s_lshr_b32 s68, s58, 8
+; GFX9-NEXT:    s_lshr_b32 s99, s61, 24
 ; GFX9-NEXT:    s_lshr_b32 s12, s61, 16
-; GFX9-NEXT:    s_lshr_b32 s51, s61, 8
-; GFX9-NEXT:    s_lshr_b32 s64, s60, 16
-; GFX9-NEXT:    s_lshr_b32 s55, s60, 8
-; GFX9-NEXT:    s_lshr_b32 s86, s63, 24
+; GFX9-NEXT:    s_lshr_b32 s55, s61, 8
+; GFX9-NEXT:    s_lshr_b32 s50, s60, 16
+; GFX9-NEXT:    s_lshr_b32 s65, s60, 8
+; GFX9-NEXT:    s_lshr_b32 s96, s63, 24
 ; GFX9-NEXT:    s_lshr_b32 s11, s63, 16
-; GFX9-NEXT:    s_lshr_b32 s50, s63, 8
-; GFX9-NEXT:    s_lshr_b32 s99, s62, 16
-; GFX9-NEXT:    s_lshr_b32 s98, s62, 8
-; GFX9-NEXT:    s_lshr_b32 s83, s73, 24
+; GFX9-NEXT:    s_lshr_b32 s98, s63, 8
+; GFX9-NEXT:    s_lshr_b32 s49, s62, 16
+; GFX9-NEXT:    s_lshr_b32 s54, s62, 8
+; GFX9-NEXT:    s_lshr_b32 s85, s73, 24
 ; GFX9-NEXT:    s_lshr_b32 s10, s73, 16
-; GFX9-NEXT:    s_lshr_b32 s49, s73, 8
-; GFX9-NEXT:    s_lshr_b32 s96, s72, 16
-; GFX9-NEXT:    s_lshr_b32 s87, s72, 8
-; GFX9-NEXT:    s_lshr_b32 s80, s75, 24
+; GFX9-NEXT:    s_lshr_b32 s87, s73, 8
+; GFX9-NEXT:    s_lshr_b32 s48, s72, 16
+; GFX9-NEXT:    s_lshr_b32 s97, s72, 8
+; GFX9-NEXT:    s_lshr_b32 s82, s75, 24
 ; GFX9-NEXT:    s_lshr_b32 s9, s75, 16
-; GFX9-NEXT:    s_lshr_b32 s48, s75, 8
-; GFX9-NEXT:    s_lshr_b32 s85, s74, 16
-; GFX9-NEXT:    s_lshr_b32 s84, s74, 8
-; GFX9-NEXT:    s_lshr_b32 s71, s77, 24
+; GFX9-NEXT:    s_lshr_b32 s84, s75, 8
+; GFX9-NEXT:    s_lshr_b32 s39, s74, 16
+; GFX9-NEXT:    s_lshr_b32 s86, s74, 8
+; GFX9-NEXT:    s_lshr_b32 s80, s77, 24
 ; GFX9-NEXT:    s_lshr_b32 s8, s77, 16
-; GFX9-NEXT:    s_lshr_b32 s39, s77, 8
-; GFX9-NEXT:    s_lshr_b32 s82, s76, 16
-; GFX9-NEXT:    s_lshr_b32 s81, s76, 8
+; GFX9-NEXT:    s_lshr_b32 s81, s77, 8
+; GFX9-NEXT:    s_lshr_b32 s38, s76, 16
+; GFX9-NEXT:    s_lshr_b32 s83, s76, 8
 ; GFX9-NEXT:    v_writelane_b32 v62, s41, 1
 ; GFX9-NEXT:    s_lshr_b64 s[40:41], s[20:21], 24
 ; GFX9-NEXT:    s_lshr_b64 s[42:43], s[18:19], 24
-; GFX9-NEXT:    s_lshr_b64 s[46:47], s[16:17], 24
-; GFX9-NEXT:    s_lshr_b64 s[78:79], s[44:45], 24
+; GFX9-NEXT:    s_lshr_b64 s[44:45], s[16:17], 24
+; GFX9-NEXT:    s_lshr_b64 s[78:79], s[46:47], 24
 ; GFX9-NEXT:    s_lshr_b64 s[88:89], s[56:57], 24
 ; GFX9-NEXT:    s_lshr_b64 s[90:91], s[58:59], 24
 ; GFX9-NEXT:    s_lshr_b64 s[92:93], s[60:61], 24
@@ -174158,697 +174047,698 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    s_cbranch_execnz .LBB91_4
 ; GFX9-NEXT:  .LBB91_2: ; %cmp.true
 ; GFX9-NEXT:    s_and_b32 s6, s77, 0xffff0000
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x40c00000
-; GFX9-NEXT:    v_add_f32_e32 v1, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT:    v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40c00000
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x7fff, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX9-NEXT:    s_lshl_b32 s6, s77, 16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX9-NEXT:    v_add_f32_e32 v1, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT:    v_add_u32_e32 v2, 0x7fff, v2
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v1
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x7fff, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT:    s_and_b32 s6, s76, 0xffff0000
-; GFX9-NEXT:    v_lshl_or_b32 v2, v4, 16, v1
-; GFX9-NEXT:    v_add_f32_e32 v1, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v3, v3, v1
-; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshl_or_b32 v8, v5, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
 ; GFX9-NEXT:    v_add_u32_e32 v3, 0x7fff, v3
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_lshl_b32 s6, s76, 16
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_add_f32_e32 v3, s6, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:    v_add_f32_e32 v3, s6, v1
 ; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
 ; GFX9-NEXT:    v_add_u32_e32 v4, v4, v3
+; GFX9-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_add_u32_e32 v4, 0x7fff, v4
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v3
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX9-NEXT:    s_and_b32 s6, s75, 0xffff0000
-; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
-; GFX9-NEXT:    v_add_f32_e32 v3, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v4, v4, v3
-; GFX9-NEXT:    v_add_u32_e32 v4, 0x7fff, v4
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v3
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v6, vcc
+; GFX9-NEXT:    v_lshl_or_b32 v7, v2, 16, v3
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x7fff, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX9-NEXT:    s_lshl_b32 s6, s75, 16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX9-NEXT:    v_add_f32_e32 v3, s6, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x7fff, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v31
+; GFX9-NEXT:    s_and_b32 s6, s74, 0xffff0000
+; GFX9-NEXT:    v_lshl_or_b32 v14, v5, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x7fff, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    s_lshl_b32 s6, s74, 16
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:    v_add_f32_e32 v3, s6, v1
 ; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
 ; GFX9-NEXT:    v_add_u32_e32 v4, v4, v3
+; GFX9-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_add_u32_e32 v4, 0x7fff, v4
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v3
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v6, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 16, v3
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v32
-; GFX9-NEXT:    s_and_b32 s6, s74, 0xffff0000
-; GFX9-NEXT:    v_lshl_or_b32 v4, v7, 16, v3
-; GFX9-NEXT:    v_add_f32_e32 v3, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v6, v3, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v6, v6, v3
-; GFX9-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_add_u32_e32 v6, 0x7fff, v6
-; GFX9-NEXT:    v_or_b32_e32 v7, 0x400000, v3
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT:    s_lshl_b32 s6, s74, 16
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v6
-; GFX9-NEXT:    v_add_u32_e32 v7, 0x7fff, v7
-; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff, v34
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v33
 ; GFX9-NEXT:    s_and_b32 s6, s73, 0xffff0000
-; GFX9-NEXT:    v_lshl_or_b32 v3, v3, 16, v6
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v6
-; GFX9-NEXT:    v_add_u32_e32 v7, 0x7fff, v7
-; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX9-NEXT:    v_lshl_or_b32 v13, v2, 16, v3
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x7fff, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX9-NEXT:    s_lshl_b32 s6, s73, 16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v6
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v6
-; GFX9-NEXT:    v_add_u32_e32 v7, 0x7fff, v7
-; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 16, v6
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff, v35
+; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x7fff, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v34
 ; GFX9-NEXT:    s_and_b32 s6, s72, 0xffff0000
-; GFX9-NEXT:    v_lshl_or_b32 v10, v33, 16, v6
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v6
-; GFX9-NEXT:    v_add_u32_e32 v7, 0x7fff, v7
-; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_lshl_or_b32 v16, v32, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x7fff, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_lshl_b32 s6, s72, 16
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX9-NEXT:    v_add_f32_e32 v7, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v8, v7, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v8, v8, v7
-; GFX9-NEXT:    v_add_u32_e32 v8, 0x7fff, v8
-; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v7
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v8, v9, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v37, 16, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff, v37
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:    v_add_f32_e32 v3, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v4, v4, v3
+; GFX9-NEXT:    v_add_u32_e32 v4, 0x7fff, v4
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v36, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v36
 ; GFX9-NEXT:    s_and_b32 s6, s63, 0xffff0000
-; GFX9-NEXT:    v_lshl_or_b32 v9, v6, 16, v7
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v6
-; GFX9-NEXT:    v_add_u32_e32 v7, 0x7fff, v7
-; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX9-NEXT:    v_lshl_or_b32 v15, v2, 16, v3
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x7fff, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX9-NEXT:    s_lshl_b32 s6, s63, 16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v36, 16, v6
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v6
-; GFX9-NEXT:    v_add_u32_e32 v7, 0x7fff, v7
-; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v38, 16, v6
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff, v38
+; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x7fff, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v37, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v37
 ; GFX9-NEXT:    s_and_b32 s6, s62, 0xffff0000
-; GFX9-NEXT:    v_lshl_or_b32 v16, v36, 16, v6
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v6
-; GFX9-NEXT:    v_add_u32_e32 v7, 0x7fff, v7
-; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_lshl_or_b32 v18, v35, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x7fff, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_lshl_b32 s6, s62, 16
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX9-NEXT:    v_add_f32_e32 v7, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v8, v7, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v8, v8, v7
-; GFX9-NEXT:    v_add_u32_e32 v8, 0x7fff, v8
-; GFX9-NEXT:    v_or_b32_e32 v11, 0x400000, v7
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v8, v11, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v48, 16, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff, v48
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:    v_add_f32_e32 v3, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v4, v4, v3
+; GFX9-NEXT:    v_add_u32_e32 v4, 0x7fff, v4
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v39, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v39
 ; GFX9-NEXT:    s_and_b32 s6, s61, 0xffff0000
-; GFX9-NEXT:    v_lshl_or_b32 v15, v6, 16, v7
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v6
-; GFX9-NEXT:    v_add_u32_e32 v7, 0x7fff, v7
-; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX9-NEXT:    v_lshl_or_b32 v17, v2, 16, v3
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x7fff, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX9-NEXT:    s_lshl_b32 s6, s61, 16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v39, 16, v6
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v6
-; GFX9-NEXT:    v_add_u32_e32 v7, 0x7fff, v7
-; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v49, 16, v6
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff, v49
+; GFX9-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x7fff, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v48, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v48
 ; GFX9-NEXT:    s_and_b32 s6, s60, 0xffff0000
-; GFX9-NEXT:    v_lshl_or_b32 v21, v39, 16, v6
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v6
-; GFX9-NEXT:    v_add_u32_e32 v7, 0x7fff, v7
-; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_lshl_or_b32 v20, v38, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x7fff, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_lshl_b32 s6, s60, 16
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX9-NEXT:    v_add_f32_e32 v7, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v8, v7, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v8, v8, v7
-; GFX9-NEXT:    v_add_u32_e32 v8, 0x7fff, v8
-; GFX9-NEXT:    v_or_b32_e32 v11, 0x400000, v7
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v8, v11, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v51, 16, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff, v51
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:    v_add_f32_e32 v3, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v4, v4, v3
+; GFX9-NEXT:    v_add_u32_e32 v4, 0x7fff, v4
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v50, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v50
 ; GFX9-NEXT:    s_and_b32 s6, s59, 0xffff0000
-; GFX9-NEXT:    v_lshl_or_b32 v20, v6, 16, v7
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v6
-; GFX9-NEXT:    v_add_u32_e32 v7, 0x7fff, v7
-; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX9-NEXT:    v_lshl_or_b32 v19, v2, 16, v3
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x7fff, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX9-NEXT:    s_lshl_b32 s6, s59, 16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v50, 16, v6
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v6
-; GFX9-NEXT:    v_add_u32_e32 v7, 0x7fff, v7
-; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v52, 16, v6
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff, v52
+; GFX9-NEXT:    v_lshrrev_b32_e32 v49, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x7fff, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v51, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v51
 ; GFX9-NEXT:    s_and_b32 s6, s58, 0xffff0000
-; GFX9-NEXT:    v_lshl_or_b32 v23, v50, 16, v6
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v6
-; GFX9-NEXT:    v_add_u32_e32 v7, 0x7fff, v7
-; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_lshl_or_b32 v22, v49, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x7fff, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_lshl_b32 s6, s58, 16
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX9-NEXT:    v_add_f32_e32 v7, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v8, v7, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v8, v8, v7
-; GFX9-NEXT:    v_add_u32_e32 v8, 0x7fff, v8
-; GFX9-NEXT:    v_or_b32_e32 v11, 0x400000, v7
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v8, v11, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v54, 16, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff, v54
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:    v_add_f32_e32 v3, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v4, v4, v3
+; GFX9-NEXT:    v_add_u32_e32 v4, 0x7fff, v4
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v53, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v53
 ; GFX9-NEXT:    s_and_b32 s6, s57, 0xffff0000
-; GFX9-NEXT:    v_lshl_or_b32 v22, v6, 16, v7
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v6
-; GFX9-NEXT:    v_add_u32_e32 v7, 0x7fff, v7
-; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX9-NEXT:    v_lshl_or_b32 v21, v2, 16, v3
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x7fff, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GFX9-NEXT:    s_lshl_b32 s6, s57, 16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v53, 16, v6
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v6
-; GFX9-NEXT:    v_add_u32_e32 v7, 0x7fff, v7
-; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v55, 16, v6
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff, v55
+; GFX9-NEXT:    v_lshrrev_b32_e32 v52, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x7fff, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v54, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v54
 ; GFX9-NEXT:    s_and_b32 s6, s56, 0xffff0000
-; GFX9-NEXT:    v_lshl_or_b32 v25, v53, 16, v6
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v6
-; GFX9-NEXT:    v_add_u32_e32 v7, 0x7fff, v7
-; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_lshl_or_b32 v24, v52, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x7fff, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_lshl_b32 s6, s56, 16
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX9-NEXT:    v_add_f32_e32 v7, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v8, v7, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v8, v8, v7
-; GFX9-NEXT:    v_add_u32_e32 v8, 0x7fff, v8
-; GFX9-NEXT:    v_or_b32_e32 v11, 0x400000, v7
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v8, v11, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v41, 16, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff, v41
-; GFX9-NEXT:    s_and_b32 s6, s45, 0xffff0000
-; GFX9-NEXT:    v_lshl_or_b32 v24, v6, 16, v7
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v6
-; GFX9-NEXT:    v_add_u32_e32 v7, 0x7fff, v7
-; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX9-NEXT:    s_lshl_b32 s6, s45, 16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v40, 16, v6
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v6
-; GFX9-NEXT:    v_add_u32_e32 v7, 0x7fff, v7
-; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v42, 16, v6
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff, v42
-; GFX9-NEXT:    s_and_b32 s6, s44, 0xffff0000
-; GFX9-NEXT:    v_lshl_or_b32 v27, v40, 16, v6
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v6
-; GFX9-NEXT:    v_add_u32_e32 v7, 0x7fff, v7
-; GFX9-NEXT:    v_or_b32_e32 v8, 0x400000, v6
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT:    s_lshl_b32 s6, s44, 16
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX9-NEXT:    v_add_f32_e32 v7, s6, v5
-; GFX9-NEXT:    v_bfe_u32 v8, v7, 16, 1
-; GFX9-NEXT:    v_add_u32_e32 v8, v8, v7
-; GFX9-NEXT:    v_add_u32_e32 v8, 0x7fff, v8
-; GFX9-NEXT:    v_or_b32_e32 v11, 0x400000, v7
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v8, v11, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v43, 16, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff, v43
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:    v_add_f32_e32 v3, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v4, v4, v3
+; GFX9-NEXT:    v_add_u32_e32 v4, 0x7fff, v4
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v40, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v40
+; GFX9-NEXT:    s_and_b32 s6, s47, 0xffff0000
+; GFX9-NEXT:    v_lshl_or_b32 v23, v2, 16, v3
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x7fff, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:    s_lshl_b32 s6, s47, 16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v55, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x7fff, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v41, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v41
+; GFX9-NEXT:    s_and_b32 s6, s46, 0xffff0000
+; GFX9-NEXT:    v_lshl_or_b32 v26, v55, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x7fff, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    s_lshl_b32 s6, s46, 16
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:    v_add_f32_e32 v3, s6, v1
+; GFX9-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT:    v_add_u32_e32 v4, v4, v3
+; GFX9-NEXT:    v_add_u32_e32 v4, 0x7fff, v4
+; GFX9-NEXT:    v_or_b32_e32 v5, 0x400000, v3
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v42, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v42
 ; GFX9-NEXT:    s_and_b32 s6, s17, 0xffff0000
-; GFX9-NEXT:    v_lshl_or_b32 v26, v6, 16, v7
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_lshl_or_b32 v25, v2, 16, v3
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX9-NEXT:    s_lshr_b32 s11, s6, 16
 ; GFX9-NEXT:    s_lshl_b32 s6, s17, 16
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX9-NEXT:    s_lshr_b32 s17, s6, 16
 ; GFX9-NEXT:    s_and_b32 s6, s16, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX9-NEXT:    s_lshr_b32 s8, s6, 16
 ; GFX9-NEXT:    s_lshl_b32 s6, s16, 16
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_add_i32 s9, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s10, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s10, s9
 ; GFX9-NEXT:    s_lshr_b32 s16, s6, 16
 ; GFX9-NEXT:    s_and_b32 s6, s19, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
-; GFX9-NEXT:    s_pack_ll_b32_b16 s44, s16, s8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s46, s16, s8
 ; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX9-NEXT:    s_lshr_b32 s12, s6, 16
 ; GFX9-NEXT:    s_lshl_b32 s6, s19, 16
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX9-NEXT:    s_lshr_b32 s19, s6, 16
 ; GFX9-NEXT:    s_and_b32 s6, s18, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX9-NEXT:    s_lshr_b32 s8, s6, 16
 ; GFX9-NEXT:    s_lshl_b32 s6, s18, 16
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_add_i32 s9, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s10, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s10, s9
 ; GFX9-NEXT:    s_lshr_b32 s18, s6, 16
 ; GFX9-NEXT:    s_and_b32 s6, s21, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s56, s18, s8
 ; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX9-NEXT:    s_lshr_b32 s13, s6, 16
 ; GFX9-NEXT:    s_lshl_b32 s6, s21, 16
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX9-NEXT:    s_lshr_b32 s21, s6, 16
 ; GFX9-NEXT:    s_and_b32 s6, s20, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX9-NEXT:    s_lshr_b32 s8, s6, 16
 ; GFX9-NEXT:    s_lshl_b32 s6, s20, 16
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_add_i32 s9, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s10, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s10, s9
 ; GFX9-NEXT:    s_lshr_b32 s20, s6, 16
 ; GFX9-NEXT:    s_and_b32 s6, s23, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s58, s20, s8
 ; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX9-NEXT:    s_lshr_b32 s14, s6, 16
 ; GFX9-NEXT:    s_lshl_b32 s6, s23, 16
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX9-NEXT:    s_lshr_b32 s23, s6, 16
 ; GFX9-NEXT:    s_and_b32 s6, s22, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX9-NEXT:    s_lshr_b32 s8, s6, 16
 ; GFX9-NEXT:    s_lshl_b32 s6, s22, 16
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_add_i32 s9, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s10, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s10, s9
 ; GFX9-NEXT:    s_lshr_b32 s22, s6, 16
 ; GFX9-NEXT:    s_and_b32 s6, s25, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s60, s22, s8
 ; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX9-NEXT:    s_lshr_b32 s15, s6, 16
 ; GFX9-NEXT:    s_lshl_b32 s6, s25, 16
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX9-NEXT:    s_lshr_b32 s25, s6, 16
 ; GFX9-NEXT:    s_and_b32 s6, s24, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX9-NEXT:    s_lshr_b32 s8, s6, 16
 ; GFX9-NEXT:    s_lshl_b32 s6, s24, 16
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_add_i32 s9, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s10, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s10, s9
 ; GFX9-NEXT:    s_lshr_b32 s24, s6, 16
 ; GFX9-NEXT:    s_and_b32 s6, s27, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s62, s24, s8
 ; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX9-NEXT:    s_lshr_b32 s76, s6, 16
 ; GFX9-NEXT:    s_lshl_b32 s6, s27, 16
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX9-NEXT:    s_lshr_b32 s27, s6, 16
 ; GFX9-NEXT:    s_and_b32 s6, s26, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX9-NEXT:    s_lshr_b32 s8, s6, 16
 ; GFX9-NEXT:    s_lshl_b32 s6, s26, 16
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_add_i32 s9, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s10, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s10, s9
 ; GFX9-NEXT:    s_lshr_b32 s26, s6, 16
 ; GFX9-NEXT:    s_and_b32 s6, s29, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s72, s26, s8
 ; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX9-NEXT:    s_lshr_b32 s77, s6, 16
 ; GFX9-NEXT:    s_lshl_b32 s6, s29, 16
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX9-NEXT:    s_lshr_b32 s29, s6, 16
 ; GFX9-NEXT:    s_and_b32 s6, s28, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX9-NEXT:    s_lshr_b32 s8, s6, 16
 ; GFX9-NEXT:    s_lshl_b32 s6, s28, 16
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_add_i32 s9, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s10, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s10, s9
 ; GFX9-NEXT:    s_lshr_b32 s28, s6, 16
 ; GFX9-NEXT:    s_and_b32 s6, s5, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s74, s28, s8
 ; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
 ; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 16
-; GFX9-NEXT:    v_add_f32_e32 v6, s5, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s5, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s5, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
 ; GFX9-NEXT:    s_lshr_b32 s78, s6, 16
 ; GFX9-NEXT:    s_bfe_u32 s6, s5, 0x10010
 ; GFX9-NEXT:    s_add_i32 s6, s6, s5
 ; GFX9-NEXT:    s_add_i32 s8, s6, 0x7fff
 ; GFX9-NEXT:    s_bitset1_b32 s5, 22
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s5, s5, s8
 ; GFX9-NEXT:    s_and_b32 s6, s4, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v6, s6, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX9-NEXT:    s_bfe_u32 s8, s6, 0x10010
 ; GFX9-NEXT:    s_add_i32 s8, s8, s6
 ; GFX9-NEXT:    s_lshr_b32 s5, s5, 16
 ; GFX9-NEXT:    s_add_i32 s10, s8, 0x7fff
 ; GFX9-NEXT:    s_bitset1_b32 s6, 22
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s6, s6, s10
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX9-NEXT:    v_add_f32_e32 v5, s4, v5
-; GFX9-NEXT:    v_readfirstlane_b32 s4, v5
+; GFX9-NEXT:    v_add_f32_e32 v1, s4, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
 ; GFX9-NEXT:    s_bfe_u32 s8, s4, 0x10010
 ; GFX9-NEXT:    s_add_i32 s8, s8, s4
 ; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
 ; GFX9-NEXT:    s_add_i32 s10, s8, 0x7fff
 ; GFX9-NEXT:    s_bitset1_b32 s4, 22
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_lshrrev_b64 v[1:2], 24, v[25:26]
 ; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 24, v[23:24]
 ; GFX9-NEXT:    s_cselect_b32 s4, s4, s10
-; GFX9-NEXT:    v_lshrrev_b64 v[5:6], 24, v[26:27]
-; GFX9-NEXT:    v_lshrrev_b64 v[11:12], 24, v[20:21]
-; GFX9-NEXT:    s_pack_ll_b32_b16 s45, s17, s11
+; GFX9-NEXT:    v_lshrrev_b64 v[3:4], 24, v[21:22]
+; GFX9-NEXT:    v_lshrrev_b64 v[9:10], 24, v[15:16]
+; GFX9-NEXT:    s_pack_ll_b32_b16 s47, s17, s11
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s57, s19, s12
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s59, s21, s13
 ; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX9-NEXT:    v_lshrrev_b64 v[6:7], 24, v[24:25]
-; GFX9-NEXT:    v_lshrrev_b64 v[12:13], 24, v[15:16]
-; GFX9-NEXT:    v_lshrrev_b64 v[17:18], 24, v[3:4]
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], 24, v[19:20]
+; GFX9-NEXT:    v_lshrrev_b64 v[10:11], 24, v[13:14]
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s61, s23, s14
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s63, s25, s15
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s73, s27, s76
@@ -174857,10 +174747,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s4, s6
 ; GFX9-NEXT:    s_lshr_b64 s[40:41], s[58:59], 24
 ; GFX9-NEXT:    s_lshr_b64 s[42:43], s[56:57], 24
-; GFX9-NEXT:    s_lshr_b64 s[46:47], s[44:45], 24
-; GFX9-NEXT:    v_lshrrev_b64 v[7:8], 24, v[22:23]
-; GFX9-NEXT:    v_lshrrev_b64 v[13:14], 24, v[9:10]
-; GFX9-NEXT:    v_lshrrev_b64 v[18:19], 24, v[1:2]
+; GFX9-NEXT:    s_lshr_b64 s[44:45], s[46:47], 24
+; GFX9-NEXT:    v_lshrrev_b64 v[5:6], 24, v[17:18]
+; GFX9-NEXT:    v_lshrrev_b64 v[11:12], 24, v[7:8]
 ; GFX9-NEXT:    s_lshr_b64 s[34:35], s[6:7], 24
 ; GFX9-NEXT:    s_lshr_b64 s[36:37], s[74:75], 24
 ; GFX9-NEXT:    s_lshr_b64 s[38:39], s[72:73], 24
@@ -174870,7 +174759,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    s_lshr_b32 s10, s7, 8
 ; GFX9-NEXT:    s_lshr_b32 s41, s6, 16
 ; GFX9-NEXT:    s_lshr_b32 s43, s6, 8
-; GFX9-NEXT:    s_lshr_b32 s47, s75, 24
+; GFX9-NEXT:    s_lshr_b32 s45, s75, 24
 ; GFX9-NEXT:    s_lshr_b32 s75, s75, 8
 ; GFX9-NEXT:    s_lshr_b32 s79, s74, 16
 ; GFX9-NEXT:    s_lshr_b32 s74, s74, 8
@@ -174894,42 +174783,42 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    s_lshr_b32 s57, s57, 8
 ; GFX9-NEXT:    s_lshr_b32 vcc_hi, s56, 16
 ; GFX9-NEXT:    s_lshr_b32 s56, s56, 8
-; GFX9-NEXT:    s_lshr_b32 s30, s45, 24
-; GFX9-NEXT:    s_lshr_b32 s45, s45, 8
-; GFX9-NEXT:    s_lshr_b32 s8, s44, 16
-; GFX9-NEXT:    s_lshr_b32 s7, s44, 8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 24, v27
-; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 8, v27
-; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v26
-; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 8, v26
-; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 24, v25
+; GFX9-NEXT:    s_lshr_b32 s30, s47, 24
+; GFX9-NEXT:    s_lshr_b32 s47, s47, 8
+; GFX9-NEXT:    s_lshr_b32 s8, s46, 16
+; GFX9-NEXT:    s_lshr_b32 s7, s46, 8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v26
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 8, v26
+; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 16, v25
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 8, v25
-; GFX9-NEXT:    v_lshrrev_b32_e32 v44, 16, v24
+; GFX9-NEXT:    v_lshrrev_b32_e32 v43, 24, v24
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 8, v24
-; GFX9-NEXT:    v_lshrrev_b32_e32 v45, 24, v23
+; GFX9-NEXT:    v_lshrrev_b32_e32 v44, 16, v23
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 8, v23
-; GFX9-NEXT:    v_lshrrev_b32_e32 v46, 16, v22
+; GFX9-NEXT:    v_lshrrev_b32_e32 v45, 24, v22
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 8, v22
-; GFX9-NEXT:    v_lshrrev_b32_e32 v47, 24, v21
+; GFX9-NEXT:    v_lshrrev_b32_e32 v46, 16, v21
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 8, v21
-; GFX9-NEXT:    v_lshrrev_b32_e32 v56, 16, v20
+; GFX9-NEXT:    v_lshrrev_b32_e32 v47, 24, v20
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v20, 8, v20
-; GFX9-NEXT:    v_lshrrev_b32_e32 v57, 24, v16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v56, 16, v19
+; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 8, v19
+; GFX9-NEXT:    v_lshrrev_b32_e32 v57, 24, v18
+; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 8, v18
+; GFX9-NEXT:    v_lshrrev_b32_e32 v58, 16, v17
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 8, v17
+; GFX9-NEXT:    v_lshrrev_b32_e32 v59, 24, v16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v58, 16, v15
+; GFX9-NEXT:    v_lshrrev_b32_e32 v60, 16, v15
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v15
-; GFX9-NEXT:    v_lshrrev_b32_e32 v59, 24, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v60, 16, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v61, 24, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v28, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v29, 24, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v30, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v61, 24, v14
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 8, v14
+; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v13
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v13
+; GFX9-NEXT:    v_lshrrev_b32_e32 v28, 24, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v29, 16, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v7
 ; GFX9-NEXT:    s_branch .LBB91_5
 ; GFX9-NEXT:  .LBB91_3:
 ; GFX9-NEXT:    ; implicit-def: $sgpr6
@@ -174944,46 +174833,46 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    ; implicit-def: $sgpr6
 ; GFX9-NEXT:    ; kill: killed $sgpr6
 ; GFX9-NEXT:    ; implicit-def: $sgpr78
+; GFX9-NEXT:    ; implicit-def: $sgpr83
+; GFX9-NEXT:    ; implicit-def: $sgpr38
 ; GFX9-NEXT:    ; implicit-def: $sgpr81
-; GFX9-NEXT:    ; implicit-def: $sgpr82
-; GFX9-NEXT:    ; implicit-def: $sgpr39
 ; GFX9-NEXT:    ; implicit-def: $sgpr8
-; GFX9-NEXT:    ; implicit-def: $sgpr71
+; GFX9-NEXT:    ; implicit-def: $sgpr80
+; GFX9-NEXT:    ; implicit-def: $sgpr86
+; GFX9-NEXT:    ; implicit-def: $sgpr39
 ; GFX9-NEXT:    ; implicit-def: $sgpr84
-; GFX9-NEXT:    ; implicit-def: $sgpr85
-; GFX9-NEXT:    ; implicit-def: $sgpr48
 ; GFX9-NEXT:    ; implicit-def: $sgpr9
-; GFX9-NEXT:    ; implicit-def: $sgpr80
+; GFX9-NEXT:    ; implicit-def: $sgpr82
+; GFX9-NEXT:    ; implicit-def: $sgpr97
+; GFX9-NEXT:    ; implicit-def: $sgpr48
 ; GFX9-NEXT:    ; implicit-def: $sgpr87
-; GFX9-NEXT:    ; implicit-def: $sgpr96
-; GFX9-NEXT:    ; implicit-def: $sgpr49
 ; GFX9-NEXT:    ; implicit-def: $sgpr10
-; GFX9-NEXT:    ; implicit-def: $sgpr83
+; GFX9-NEXT:    ; implicit-def: $sgpr85
+; GFX9-NEXT:    ; implicit-def: $sgpr54
+; GFX9-NEXT:    ; implicit-def: $sgpr49
 ; GFX9-NEXT:    ; implicit-def: $sgpr98
-; GFX9-NEXT:    ; implicit-def: $sgpr99
-; GFX9-NEXT:    ; implicit-def: $sgpr50
 ; GFX9-NEXT:    ; implicit-def: $sgpr11
-; GFX9-NEXT:    ; implicit-def: $sgpr86
+; GFX9-NEXT:    ; implicit-def: $sgpr96
+; GFX9-NEXT:    ; implicit-def: $sgpr65
+; GFX9-NEXT:    ; implicit-def: $sgpr50
 ; GFX9-NEXT:    ; implicit-def: $sgpr55
-; GFX9-NEXT:    ; implicit-def: $sgpr64
-; GFX9-NEXT:    ; implicit-def: $sgpr51
 ; GFX9-NEXT:    ; implicit-def: $sgpr12
-; GFX9-NEXT:    ; implicit-def: $sgpr97
+; GFX9-NEXT:    ; implicit-def: $sgpr99
+; GFX9-NEXT:    ; implicit-def: $sgpr68
+; GFX9-NEXT:    ; implicit-def: $sgpr51
 ; GFX9-NEXT:    ; implicit-def: $sgpr66
-; GFX9-NEXT:    ; implicit-def: $sgpr67
-; GFX9-NEXT:    ; implicit-def: $sgpr52
 ; GFX9-NEXT:    ; implicit-def: $sgpr13
-; GFX9-NEXT:    ; implicit-def: $sgpr54
-; GFX9-NEXT:    ; implicit-def: $sgpr38
+; GFX9-NEXT:    ; implicit-def: $sgpr64
+; GFX9-NEXT:    ; implicit-def: $sgpr71
 ; GFX9-NEXT:    ; implicit-def: $sgpr69
-; GFX9-NEXT:    ; implicit-def: $sgpr68
 ; GFX9-NEXT:    ; implicit-def: $sgpr14
-; GFX9-NEXT:    ; implicit-def: $sgpr65
+; GFX9-NEXT:    ; implicit-def: $sgpr67
+; GFX9-NEXT:    ; implicit-def: $sgpr52
+; GFX9-NEXT:    ; implicit-def: $sgpr53
 ; GFX9-NEXT:    ; implicit-def: $sgpr7
-; GFX9-NEXT:    ; implicit-def: $sgpr70
 ; GFX9-NEXT:    ; implicit-def: $sgpr15
-; GFX9-NEXT:    ; implicit-def: $sgpr53
-; GFX9-NEXT:    ; implicit-def: $sgpr46
+; GFX9-NEXT:    ; implicit-def: $sgpr70
+; GFX9-NEXT:    ; implicit-def: $sgpr44
 ; GFX9-NEXT:    ; implicit-def: $sgpr42
 ; GFX9-NEXT:    ; implicit-def: $sgpr40
 ; GFX9-NEXT:    ; implicit-def: $sgpr36
@@ -175087,72 +174976,72 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s77
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s8
-; GFX9-NEXT:    v_mov_b32_e32 v24, s38
-; GFX9-NEXT:    v_mov_b32_e32 v21, s51
-; GFX9-NEXT:    v_mov_b32_e32 v16, s50
-; GFX9-NEXT:    v_mov_b32_e32 v10, s49
-; GFX9-NEXT:    v_mov_b32_e32 v4, s48
-; GFX9-NEXT:    v_mov_b32_e32 v2, s39
-; GFX9-NEXT:    v_mov_b32_e32 v17, s34
-; GFX9-NEXT:    v_mov_b32_e32 v18, s36
+; GFX9-NEXT:    v_mov_b32_e32 v46, s51
+; GFX9-NEXT:    v_mov_b32_e32 v56, s50
+; GFX9-NEXT:    v_mov_b32_e32 v58, s49
+; GFX9-NEXT:    v_mov_b32_e32 v60, s48
+; GFX9-NEXT:    v_mov_b32_e32 v27, s39
+; GFX9-NEXT:    v_mov_b32_e32 v29, s38
+; GFX9-NEXT:    v_mov_b32_e32 v10, s34
+; GFX9-NEXT:    v_mov_b32_e32 v11, s36
 ; GFX9-NEXT:    v_readlane_b32 s34, v62, 8
 ; GFX9-NEXT:    v_readlane_b32 s36, v62, 6
 ; GFX9-NEXT:    v_readlane_b32 s38, v62, 4
 ; GFX9-NEXT:    v_readlane_b32 s48, v62, 2
 ; GFX9-NEXT:    v_readlane_b32 s50, v62, 0
-; GFX9-NEXT:    v_mov_b32_e32 v43, s44
-; GFX9-NEXT:    v_mov_b32_e32 v42, s45
-; GFX9-NEXT:    v_mov_b32_e32 v40, s15
-; GFX9-NEXT:    v_mov_b32_e32 v41, s56
-; GFX9-NEXT:    v_mov_b32_e32 v55, s57
-; GFX9-NEXT:    v_mov_b32_e32 v53, s14
-; GFX9-NEXT:    v_mov_b32_e32 v54, s58
-; GFX9-NEXT:    v_mov_b32_e32 v52, s59
-; GFX9-NEXT:    v_mov_b32_e32 v50, s13
-; GFX9-NEXT:    v_mov_b32_e32 v51, s60
-; GFX9-NEXT:    v_mov_b32_e32 v49, s61
-; GFX9-NEXT:    v_mov_b32_e32 v39, s12
-; GFX9-NEXT:    v_mov_b32_e32 v48, s62
-; GFX9-NEXT:    v_mov_b32_e32 v38, s63
-; GFX9-NEXT:    v_mov_b32_e32 v36, s11
-; GFX9-NEXT:    v_mov_b32_e32 v37, s72
-; GFX9-NEXT:    v_mov_b32_e32 v35, s73
-; GFX9-NEXT:    v_mov_b32_e32 v33, s10
-; GFX9-NEXT:    v_mov_b32_e32 v34, s74
-; GFX9-NEXT:    v_mov_b32_e32 v32, s75
+; GFX9-NEXT:    v_mov_b32_e32 v42, s46
+; GFX9-NEXT:    v_mov_b32_e32 v41, s47
+; GFX9-NEXT:    v_mov_b32_e32 v55, s15
+; GFX9-NEXT:    v_mov_b32_e32 v40, s56
+; GFX9-NEXT:    v_mov_b32_e32 v54, s57
+; GFX9-NEXT:    v_mov_b32_e32 v52, s14
+; GFX9-NEXT:    v_mov_b32_e32 v53, s58
+; GFX9-NEXT:    v_mov_b32_e32 v51, s59
+; GFX9-NEXT:    v_mov_b32_e32 v49, s13
+; GFX9-NEXT:    v_mov_b32_e32 v50, s60
+; GFX9-NEXT:    v_mov_b32_e32 v48, s61
+; GFX9-NEXT:    v_mov_b32_e32 v38, s12
+; GFX9-NEXT:    v_mov_b32_e32 v39, s62
+; GFX9-NEXT:    v_mov_b32_e32 v37, s63
+; GFX9-NEXT:    v_mov_b32_e32 v35, s11
+; GFX9-NEXT:    v_mov_b32_e32 v36, s72
+; GFX9-NEXT:    v_mov_b32_e32 v34, s73
+; GFX9-NEXT:    v_mov_b32_e32 v32, s10
+; GFX9-NEXT:    v_mov_b32_e32 v33, s74
+; GFX9-NEXT:    v_mov_b32_e32 v31, s75
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_mov_b32_e32 v19, s7
-; GFX9-NEXT:    v_mov_b32_e32 v26, s6
-; GFX9-NEXT:    v_mov_b32_e32 v8, s53
-; GFX9-NEXT:    v_mov_b32_e32 v14, s70
-; GFX9-NEXT:    v_mov_b32_e32 v44, s69
-; GFX9-NEXT:    v_mov_b32_e32 v27, s65
-; GFX9-NEXT:    v_mov_b32_e32 v25, s68
-; GFX9-NEXT:    v_mov_b32_e32 v46, s67
+; GFX9-NEXT:    v_mov_b32_e32 v26, s53
+; GFX9-NEXT:    v_mov_b32_e32 v25, s52
+; GFX9-NEXT:    v_mov_b32_e32 v6, s70
+; GFX9-NEXT:    v_mov_b32_e32 v12, s7
+; GFX9-NEXT:    v_mov_b32_e32 v44, s6
+; GFX9-NEXT:    v_mov_b32_e32 v23, s71
+; GFX9-NEXT:    v_mov_b32_e32 v43, s67
+; GFX9-NEXT:    v_mov_b32_e32 v24, s69
+; GFX9-NEXT:    v_mov_b32_e32 v21, s68
+; GFX9-NEXT:    v_mov_b32_e32 v45, s64
 ; GFX9-NEXT:    v_mov_b32_e32 v22, s66
-; GFX9-NEXT:    v_mov_b32_e32 v45, s54
-; GFX9-NEXT:    v_mov_b32_e32 v23, s52
-; GFX9-NEXT:    v_mov_b32_e32 v56, s64
+; GFX9-NEXT:    v_mov_b32_e32 v19, s65
+; GFX9-NEXT:    v_mov_b32_e32 v47, s99
 ; GFX9-NEXT:    v_mov_b32_e32 v20, s55
-; GFX9-NEXT:    v_mov_b32_e32 v47, s97
-; GFX9-NEXT:    v_mov_b32_e32 v58, s99
-; GFX9-NEXT:    v_mov_b32_e32 v15, s98
-; GFX9-NEXT:    v_mov_b32_e32 v57, s86
-; GFX9-NEXT:    v_mov_b32_e32 v60, s96
-; GFX9-NEXT:    v_mov_b32_e32 v9, s87
-; GFX9-NEXT:    v_mov_b32_e32 v59, s83
-; GFX9-NEXT:    v_mov_b32_e32 v28, s85
-; GFX9-NEXT:    v_mov_b32_e32 v3, s84
-; GFX9-NEXT:    v_mov_b32_e32 v61, s80
-; GFX9-NEXT:    v_mov_b32_e32 v30, s82
-; GFX9-NEXT:    v_mov_b32_e32 v1, s81
-; GFX9-NEXT:    v_mov_b32_e32 v29, s71
-; GFX9-NEXT:    v_mov_b32_e32 v5, s78
-; GFX9-NEXT:    v_mov_b32_e32 v6, s88
-; GFX9-NEXT:    v_mov_b32_e32 v7, s90
-; GFX9-NEXT:    v_mov_b32_e32 v11, s92
-; GFX9-NEXT:    v_mov_b32_e32 v12, s94
-; GFX9-NEXT:    v_mov_b32_e32 v13, s30
+; GFX9-NEXT:    v_mov_b32_e32 v17, s54
+; GFX9-NEXT:    v_mov_b32_e32 v57, s96
+; GFX9-NEXT:    v_mov_b32_e32 v18, s98
+; GFX9-NEXT:    v_mov_b32_e32 v15, s97
+; GFX9-NEXT:    v_mov_b32_e32 v59, s85
+; GFX9-NEXT:    v_mov_b32_e32 v16, s87
+; GFX9-NEXT:    v_mov_b32_e32 v13, s86
+; GFX9-NEXT:    v_mov_b32_e32 v61, s82
+; GFX9-NEXT:    v_mov_b32_e32 v14, s84
+; GFX9-NEXT:    v_mov_b32_e32 v7, s83
+; GFX9-NEXT:    v_mov_b32_e32 v28, s80
+; GFX9-NEXT:    v_mov_b32_e32 v8, s81
+; GFX9-NEXT:    v_mov_b32_e32 v1, s78
+; GFX9-NEXT:    v_mov_b32_e32 v2, s88
+; GFX9-NEXT:    v_mov_b32_e32 v3, s90
+; GFX9-NEXT:    v_mov_b32_e32 v4, s92
+; GFX9-NEXT:    v_mov_b32_e32 v5, s94
+; GFX9-NEXT:    v_mov_b32_e32 v9, s30
 ; GFX9-NEXT:    v_readlane_b32 s11, v62, 10
 ; GFX9-NEXT:    v_readlane_b32 s12, v62, 11
 ; GFX9-NEXT:    v_readlane_b32 s13, v62, 12
@@ -175165,7 +175054,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    v_readlane_b32 s10, v62, 19
 ; GFX9-NEXT:    v_readlane_b32 s41, v62, 20
 ; GFX9-NEXT:    v_readlane_b32 s43, v62, 21
-; GFX9-NEXT:    v_readlane_b32 s47, v62, 22
+; GFX9-NEXT:    v_readlane_b32 s45, v62, 22
 ; GFX9-NEXT:    v_readlane_b32 s75, v62, 23
 ; GFX9-NEXT:    v_readlane_b32 s79, v62, 24
 ; GFX9-NEXT:    v_readlane_b32 s74, v62, 25
@@ -175190,7 +175079,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    v_readlane_b32 vcc_hi, v62, 44
 ; GFX9-NEXT:    v_readlane_b32 s56, v62, 45
 ; GFX9-NEXT:    v_readlane_b32 s30, v62, 46
-; GFX9-NEXT:    v_readlane_b32 s45, v62, 47
+; GFX9-NEXT:    v_readlane_b32 s47, v62, 47
 ; GFX9-NEXT:    v_readlane_b32 s8, v62, 48
 ; GFX9-NEXT:    v_readlane_b32 s7, v62, 49
 ; GFX9-NEXT:    v_readlane_b32 s35, v62, 9
@@ -175203,14 +175092,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    s_and_b32 s7, s8, 0xff
-; GFX9-NEXT:    s_lshl_b32 s8, s46, 8
+; GFX9-NEXT:    s_lshl_b32 s8, s44, 8
 ; GFX9-NEXT:    s_or_b32 s7, s7, s8
 ; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    v_mov_b32_e32 v31, s6
+; GFX9-NEXT:    v_mov_b32_e32 v30, s6
 ; GFX9-NEXT:    s_and_b32 s6, s17, 0xff
-; GFX9-NEXT:    s_lshl_b32 s7, s45, 8
+; GFX9-NEXT:    s_lshl_b32 s7, s47, 8
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    s_and_b32 s7, s11, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s8, s30, 8
@@ -175218,8 +175107,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen
-; GFX9-NEXT:    v_mov_b32_e32 v31, s6
+; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen
+; GFX9-NEXT:    v_mov_b32_e32 v30, s6
 ; GFX9-NEXT:    s_and_b32 s6, s18, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s7, s56, 8
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
@@ -175229,8 +175118,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT:    v_mov_b32_e32 v31, s6
+; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    v_mov_b32_e32 v30, s6
 ; GFX9-NEXT:    s_and_b32 s6, s19, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s7, s57, 8
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
@@ -175240,8 +175129,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:8
-; GFX9-NEXT:    v_mov_b32_e32 v31, s6
+; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:8
+; GFX9-NEXT:    v_mov_b32_e32 v30, s6
 ; GFX9-NEXT:    s_and_b32 s6, s20, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s7, s58, 8
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
@@ -175251,8 +175140,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:12
-; GFX9-NEXT:    v_mov_b32_e32 v31, s6
+; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:12
+; GFX9-NEXT:    v_mov_b32_e32 v30, s6
 ; GFX9-NEXT:    s_and_b32 s6, s21, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s7, s59, 8
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
@@ -175262,8 +175151,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT:    v_mov_b32_e32 v31, s6
+; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:16
+; GFX9-NEXT:    v_mov_b32_e32 v30, s6
 ; GFX9-NEXT:    s_and_b32 s6, s22, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s7, s60, 8
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
@@ -175273,8 +175162,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:20
-; GFX9-NEXT:    v_mov_b32_e32 v31, s6
+; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:20
+; GFX9-NEXT:    v_mov_b32_e32 v30, s6
 ; GFX9-NEXT:    s_and_b32 s6, s23, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s7, s61, 8
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
@@ -175284,8 +175173,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:24
-; GFX9-NEXT:    v_mov_b32_e32 v31, s6
+; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:24
+; GFX9-NEXT:    v_mov_b32_e32 v30, s6
 ; GFX9-NEXT:    s_and_b32 s6, s24, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s7, s62, 8
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
@@ -175295,8 +175184,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT:    v_mov_b32_e32 v31, s6
+; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:28
+; GFX9-NEXT:    v_mov_b32_e32 v30, s6
 ; GFX9-NEXT:    s_and_b32 s6, s25, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s7, s63, 8
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
@@ -175306,8 +175195,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:32
-; GFX9-NEXT:    v_mov_b32_e32 v31, s6
+; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v30, s6
 ; GFX9-NEXT:    s_and_b32 s6, s26, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s7, s72, 8
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
@@ -175317,8 +175206,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:36
-; GFX9-NEXT:    v_mov_b32_e32 v31, s6
+; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:36
+; GFX9-NEXT:    v_mov_b32_e32 v30, s6
 ; GFX9-NEXT:    s_and_b32 s6, s27, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s7, s73, 8
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
@@ -175328,8 +175217,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:40
-; GFX9-NEXT:    v_mov_b32_e32 v31, s6
+; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:40
+; GFX9-NEXT:    v_mov_b32_e32 v30, s6
 ; GFX9-NEXT:    s_and_b32 s6, s28, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s7, s74, 8
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
@@ -175339,19 +175228,19 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:44
-; GFX9-NEXT:    v_mov_b32_e32 v31, s6
+; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:44
+; GFX9-NEXT:    v_mov_b32_e32 v30, s6
 ; GFX9-NEXT:    s_and_b32 s6, s29, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s7, s75, 8
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    s_and_b32 s7, s77, 0xff
-; GFX9-NEXT:    s_lshl_b32 s8, s47, 8
+; GFX9-NEXT:    s_lshl_b32 s8, s45, 8
 ; GFX9-NEXT:    s_or_b32 s7, s7, s8
 ; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:48
-; GFX9-NEXT:    v_mov_b32_e32 v31, s6
+; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:48
+; GFX9-NEXT:    v_mov_b32_e32 v30, s6
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s6, s43, 8
 ; GFX9-NEXT:    s_or_b32 s4, s4, s6
@@ -175361,8 +175250,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX9-NEXT:    s_or_b32 s4, s4, s6
-; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:52
-; GFX9-NEXT:    v_mov_b32_e32 v31, s4
+; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:52
+; GFX9-NEXT:    v_mov_b32_e32 v30, s4
 ; GFX9-NEXT:    s_and_b32 s4, s5, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s5, s10, 8
 ; GFX9-NEXT:    s_or_b32 s4, s4, s5
@@ -175372,13 +175261,23 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX9-NEXT:    s_or_b32 s4, s4, s5
-; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:56
-; GFX9-NEXT:    v_mov_b32_e32 v31, s4
-; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:56
+; GFX9-NEXT:    v_mov_b32_e32 v30, s4
+; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
+; GFX9-NEXT:    v_or_b32_sdwa v11, v29, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX9-NEXT:    v_or_b32_sdwa v5, v58, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX9-NEXT:    v_or_b32_sdwa v4, v56, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX9-NEXT:    v_or_b32_sdwa v3, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX9-NEXT:    v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 8, v18
-; GFX9-NEXT:    v_or_b32_sdwa v18, v30, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_readlane_b32 s99, v63, 35
 ; GFX9-NEXT:    v_readlane_b32 s98, v63, 34
 ; GFX9-NEXT:    v_readlane_b32 s97, v63, 33
@@ -175416,103 +175315,93 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX9-NEXT:    v_readlane_b32 s31, v63, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v63, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:64
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_or_b32_sdwa v7, v30, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v7, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:64
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 8, v8
+; GFX9-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v29
+; GFX9-NEXT:    v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 8, v28
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:68
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v17
-; GFX9-NEXT:    v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:72
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v61
-; GFX9-NEXT:    v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v8, v11, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:68
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 8, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 8, v10
+; GFX9-NEXT:    v_or_b32_sdwa v7, v33, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v8, v27, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:72
+; GFX9-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 8, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 8, v61
+; GFX9-NEXT:    v_or_b32_sdwa v7, v31, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:76
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v9
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v13
-; GFX9-NEXT:    v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:80
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v10
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v59
-; GFX9-NEXT:    v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:84
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v15
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v12
-; GFX9-NEXT:    v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:88
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v16
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v57
-; GFX9-NEXT:    v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:92
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v20
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v11
-; GFX9-NEXT:    v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:96
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v21
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v47
-; GFX9-NEXT:    v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:100
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v22
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v7
-; GFX9-NEXT:    v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:104
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v23
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v45
-; GFX9-NEXT:    v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:108
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v24
+; GFX9-NEXT:    v_or_b32_sdwa v8, v10, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:76
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 8, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 8, v9
+; GFX9-NEXT:    v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v8, v60, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:80
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 8, v16
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 8, v59
+; GFX9-NEXT:    v_or_b32_sdwa v7, v34, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v8, v32, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:84
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 8, v17
+; GFX9-NEXT:    v_or_b32_sdwa v7, v39, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:88
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 8, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 8, v57
+; GFX9-NEXT:    v_or_b32_sdwa v5, v37, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v7, v35, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:92
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 8, v19
+; GFX9-NEXT:    v_or_b32_sdwa v5, v50, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:96
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 8, v20
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 8, v47
+; GFX9-NEXT:    v_or_b32_sdwa v4, v48, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v5, v38, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:100
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 8, v21
+; GFX9-NEXT:    v_or_b32_sdwa v4, v53, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:104
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v22
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 8, v45
+; GFX9-NEXT:    v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v4, v49, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:108
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v23
+; GFX9-NEXT:    v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:112
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v43
+; GFX9-NEXT:    v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v3, v52, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:116
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v25
+; GFX9-NEXT:    v_or_b32_sdwa v2, v42, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:120
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v12
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v6
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:112
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v25
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v27
-; GFX9-NEXT:    v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:116
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v26
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
-; GFX9-NEXT:    v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v19, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:120
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v14
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
-; GFX9-NEXT:    v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
 ; GFX9-NEXT:    buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
@@ -175555,26 +175444,26 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v41, s97, 1
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s62, v3
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s63, v4
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s58, v5
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s60, v5
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v41, s98, 2
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s59, v6
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s44, v7
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s45, v8
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s61, v6
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s58, v7
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s59, v8
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s35, 3
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v41, s99, 3
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s8, v9
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v11
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s44, v9
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s45, v10
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s42, v11
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s36, 4
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v41, s100, 4
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s7, v12
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v13
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v14
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s43, v12
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s40, v13
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s41, v14
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s37, 5
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v41, s101, 5
 ; GFX11-TRUE16-NEXT:    s_mov_b32 vcc_hi, 0
-; GFX11-TRUE16-NEXT:    s_and_b32 s10, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s4, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s38, 6
@@ -175608,332 +175497,333 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s87, 31
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB91_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s27, 24
-; GFX11-TRUE16-NEXT:    s_lshr_b64 s[46:47], s[24:25], 24
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s10, 7
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s27, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s27, 24
+; GFX11-TRUE16-NEXT:    s_lshr_b64 s[14:15], s[24:25], 24
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s4, 7
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s27, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s99, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s42, s2, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s12, s2, 8
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s102, s1, 24
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s10, 6
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s27, 8
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s4, 6
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s27, 8
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s103, s1, 8
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s104, s0, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s34, s0, 8
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s10, 5
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s26, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s35, s5, 24
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s87, s5, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s54, s5, 8
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s10, 4
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s26, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s68, s4, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s69, s4, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s36, s7, 24
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s10, 3
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s25, 24
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s96, s7, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s39, s7, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s70, s6, 16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s10, 2
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s25, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s49, s6, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s37, s9, 24
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s14, s9, 16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s10, 1
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s55, s9, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s71, s8, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s80, s8, 8
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s10, 0
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s4, 5
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s26, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s35, s41, 24
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s87, s41, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s55, s41, 8
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s4, 4
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s26, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s69, s40, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s70, s40, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s36, s43, 24
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s4, 3
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s25, 24
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s96, s43, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s38, s43, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s71, s42, 16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s4, 2
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s25, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s48, s42, 8
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s50, s45, 24
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 31
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s24, 8
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s46, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s15, s45, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s8, s45, 16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s4, 1
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s25, 8
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s64, s45, 8
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 30
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s23, 24
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s47, 9
-; GFX11-TRUE16-NEXT:    s_lshr_b64 s[46:47], s[22:23], 24
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s81, s44, 16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 29
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s23, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s82, s44, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s38, s59, 24
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s40, s59, 16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 28
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s23, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s80, s44, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s81, s44, 8
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s4, 0
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s24, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s51, s59, 24
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 31
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s24, 8
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s14, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s9, s59, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s65, s59, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s83, s58, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s84, s58, 8
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 27
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s51, s63, 24
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s41, s63, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s66, s63, 8
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 24
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s22, 8
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 30
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s23, 24
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s15, 9
+; GFX11-TRUE16-NEXT:    s_lshr_b64 s[14:15], s[22:23], 24
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s82, s58, 16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 29
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s23, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s49, s58, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s37, s61, 24
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s61, 16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 28
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s23, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s66, s61, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s83, s60, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s84, s60, 8
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 27
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s22, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s52, s63, 24
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s11, s63, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s67, s63, 8
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 24
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s22, 8
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s85, s62, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s52, s73, 24
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s53, s73, 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s97, s73, 16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 23
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s21, 24
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s48, s73, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s11, s72, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s12, s72, 8
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 22
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s53, s29, 24
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 23
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s21, 24
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s39, s73, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s5, s72, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s6, s72, 8
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 22
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s21, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s54, s29, 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s98, s29, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s67, s29, 8
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 21
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s21, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s13, s28, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s68, s29, 8
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 21
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s21, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s7, s28, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s86, s28, 8
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[100:101], s[26:27], 24
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 20
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s20, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b64 s[56:57], s[2:3], 24
-; GFX11-TRUE16-NEXT:    s_lshr_b64 s[60:61], s[0:1], 24
-; GFX11-TRUE16-NEXT:    s_lshr_b64 s[74:75], s[4:5], 24
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 17
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s20, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b64 s[76:77], s[6:7], 24
-; GFX11-TRUE16-NEXT:    s_lshr_b64 s[78:79], s[8:9], 24
-; GFX11-TRUE16-NEXT:    s_lshr_b64 s[88:89], s[44:45], 24
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s19, 24
-; GFX11-TRUE16-NEXT:    s_lshr_b64 s[90:91], s[58:59], 24
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 20
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s20, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b64 s[46:47], s[2:3], 24
+; GFX11-TRUE16-NEXT:    s_lshr_b64 s[56:57], s[0:1], 24
+; GFX11-TRUE16-NEXT:    s_lshr_b64 s[74:75], s[40:41], 24
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 17
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s20, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b64 s[76:77], s[42:43], 24
+; GFX11-TRUE16-NEXT:    s_lshr_b64 s[78:79], s[44:45], 24
+; GFX11-TRUE16-NEXT:    s_lshr_b64 s[88:89], s[58:59], 24
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s19, 24
+; GFX11-TRUE16-NEXT:    s_lshr_b64 s[90:91], s[60:61], 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[92:93], s[62:63], 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[94:95], s[72:73], 24
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 15
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s19, 16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 15
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s19, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[30:31], s[28:29], 24
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 14
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s19, 8
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 14
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s19, 8
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 13
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s18, 16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 10
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s18, 8
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 13
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 10
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s18, 8
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 9
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s17, 24
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s17, 16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 9
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s17, 24
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s17, 16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 7
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s17, 8
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 6
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s16, 16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 7
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s17, 8
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 6
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s16, 16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 5
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s16, 8
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 4
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s3, 24
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 5
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s16, 8
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 4
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s3, 24
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 3
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s3, 16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 2
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s3, 8
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 3
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s3, 16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 2
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s3, 8
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 1
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s1, 16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 0
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s62, 8
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s46, 25
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s47, 26
-; GFX11-TRUE16-NEXT:    s_lshr_b64 s[46:47], s[20:21], 24
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 1
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 0
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s62, 8
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s14, 25
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s15, 26
+; GFX11-TRUE16-NEXT:    s_lshr_b64 s[14:15], s[20:21], 24
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s46, 18
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s47, 19
-; GFX11-TRUE16-NEXT:    s_lshr_b64 s[46:47], s[18:19], 24
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s46, 11
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s47, 12
-; GFX11-TRUE16-NEXT:    s_lshr_b64 s[46:47], s[16:17], 24
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s14, 18
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s15, 19
+; GFX11-TRUE16-NEXT:    s_lshr_b64 s[14:15], s[18:19], 24
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s14, 11
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s15, 12
+; GFX11-TRUE16-NEXT:    s_lshr_b64 s[14:15], s[16:17], 24
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, vcc_hi
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB91_4
 ; GFX11-TRUE16-NEXT:  .LBB91_2: ; %cmp.true
-; GFX11-TRUE16-NEXT:    s_and_b32 s10, s29, 0xffff0000
-; GFX11-TRUE16-NEXT:    s_and_b32 s42, s58, 0xffff0000
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v1, 0x40c00000, s10
-; GFX11-TRUE16-NEXT:    s_and_b32 s10, s1, 0xffff0000
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s43, s58, 16
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v6, 0x40c00000, s10
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s11, s29, 16
+; GFX11-TRUE16-NEXT:    s_and_b32 s4, s29, 0xffff0000
+; GFX11-TRUE16-NEXT:    s_and_b32 s14, s58, 0xffff0000
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT:    s_and_b32 s4, s1, 0xffff0000
+; GFX11-TRUE16-NEXT:    s_lshl_b32 s15, s58, 16
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v6, 0x40c00000, s4
+; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s29, 16
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v2, 0x40c00000, s11
-; GFX11-TRUE16-NEXT:    s_and_b32 s13, s6, 0xffff0000
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v2, 0x40c00000, s6
+; GFX11-TRUE16-NEXT:    s_and_b32 s8, s41, 0xffff0000
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s58, v6
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s6, 16
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, v3, v1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT:    s_and_b32 s47, s59, 0xffff0000
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT:    s_lshl_b32 s7, s41, 16
+; GFX11-TRUE16-NEXT:    s_and_b32 s77, s28, 0xffff0000
 ; GFX11-TRUE16-NEXT:    s_bfe_u32 s6, s58, 0x10010
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s46, s59, 16
-; GFX11-TRUE16-NEXT:    s_add_i32 s59, s6, s58
-; GFX11-TRUE16-NEXT:    s_and_b32 s75, s28, 0xffff0000
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s76, s28, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s74, s73, 0xffff0000
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s73, s73, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s61, s72, 0xffff0000
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s72, s72, 16
+; GFX11-TRUE16-NEXT:    s_lshl_b32 s78, s28, 16
+; GFX11-TRUE16-NEXT:    s_add_i32 s41, s6, s58
+; GFX11-TRUE16-NEXT:    s_and_b32 s5, s73, 0xffff0000
+; GFX11-TRUE16-NEXT:    s_lshl_b32 s76, s73, 16
+; GFX11-TRUE16-NEXT:    s_and_b32 s74, s72, 0xffff0000
+; GFX11-TRUE16-NEXT:    s_lshl_b32 s75, s72, 16
 ; GFX11-TRUE16-NEXT:    s_and_b32 s12, s63, 0xffff0000
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s60, s63, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s56, s62, 0xffff0000
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s57, s62, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s41, s45, 0xffff0000
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s28, s45, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s14, s44, 0xffff0000
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s15, s44, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s11, s9, 0xffff0000
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s44, s8, 0xffff0000
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s45, s8, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s29, s7, 0xffff0000
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s40, s7, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s5, 0xffff0000
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s7, s5, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s4, 0xffff0000
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s4, 16
-; GFX11-TRUE16-NEXT:    s_addk_i32 s59, 0x7fff
+; GFX11-TRUE16-NEXT:    s_lshl_b32 s73, s63, 16
+; GFX11-TRUE16-NEXT:    s_and_b32 s63, s62, 0xffff0000
+; GFX11-TRUE16-NEXT:    s_lshl_b32 s72, s62, 16
+; GFX11-TRUE16-NEXT:    s_and_b32 s62, s61, 0xffff0000
+; GFX11-TRUE16-NEXT:    s_lshl_b32 s57, s61, 16
+; GFX11-TRUE16-NEXT:    s_and_b32 s47, s60, 0xffff0000
+; GFX11-TRUE16-NEXT:    s_lshl_b32 s56, s60, 16
+; GFX11-TRUE16-NEXT:    s_and_b32 s46, s59, 0xffff0000
+; GFX11-TRUE16-NEXT:    s_lshl_b32 s28, s59, 16
+; GFX11-TRUE16-NEXT:    s_and_b32 s11, s45, 0xffff0000
+; GFX11-TRUE16-NEXT:    s_lshl_b32 s9, s45, 16
+; GFX11-TRUE16-NEXT:    s_and_b32 s45, s44, 0xffff0000
+; GFX11-TRUE16-NEXT:    s_lshl_b32 s44, s44, 16
+; GFX11-TRUE16-NEXT:    s_and_b32 s29, s43, 0xffff0000
+; GFX11-TRUE16-NEXT:    s_lshl_b32 s43, s43, 16
+; GFX11-TRUE16-NEXT:    s_and_b32 s13, s42, 0xffff0000
+; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s42, 16
+; GFX11-TRUE16-NEXT:    s_and_b32 s4, s40, 0xffff0000
+; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s40, 16
+; GFX11-TRUE16-NEXT:    s_addk_i32 s41, 0x7fff
 ; GFX11-TRUE16-NEXT:    s_bitset1_b32 s58, 22
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s40, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s4, s58, s59
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s40, s58, s41
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, v4, v2
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s4, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s58, s40, 16
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v3, 0x40c00000, s1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v2
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 0x7fff, v4
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v6, 0x40c00000, s75
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v6, 0x40c00000, s77
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s1, v3
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v7, 0x40c00000, s76
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v7, 0x40c00000, s78
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v1
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT:    s_bfe_u32 s58, s1, 0x10010
+; GFX11-TRUE16-NEXT:    s_bfe_u32 s41, s1, 0x10010
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v6, 16, 1
-; GFX11-TRUE16-NEXT:    s_add_i32 s58, s58, s1
+; GFX11-TRUE16-NEXT:    s_add_i32 s41, s41, s1
 ; GFX11-TRUE16-NEXT:    s_bitset1_b32 s1, 22
-; GFX11-TRUE16-NEXT:    s_addk_i32 s58, 0x7fff
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s1, s1, s58
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s0, 0xffff0000
+; GFX11-TRUE16-NEXT:    s_addk_i32 s41, 0x7fff
+; GFX11-TRUE16-NEXT:    s_and_b32 s40, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s1, s1, s41
+; GFX11-TRUE16-NEXT:    s_and_b32 s40, s0, 0xffff0000
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v2
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v2, 0x40c00000, s4
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v2, 0x40c00000, s40
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v5, v7, 16, 1
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, v4, v6
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v7
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s40, v2
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x7fff, v3
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v51, 0x40c00000, s6
-; GFX11-TRUE16-NEXT:    s_bfe_u32 s58, s4, 0x10010
+; GFX11-TRUE16-NEXT:    s_bfe_u32 s41, s40, 0x10010
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_add_i32 s58, s58, s4
-; GFX11-TRUE16-NEXT:    s_bitset1_b32 s4, 22
-; GFX11-TRUE16-NEXT:    s_addk_i32 s58, 0x7fff
-; GFX11-TRUE16-NEXT:    s_and_b32 s59, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_add_i32 s41, s41, s40
+; GFX11-TRUE16-NEXT:    s_bitset1_b32 s40, 22
+; GFX11-TRUE16-NEXT:    s_addk_i32 s41, 0x7fff
+; GFX11-TRUE16-NEXT:    s_and_b32 s42, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, v5, v7
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v6
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s4, s4, s58
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s40, s40, s41
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v6, 0x40c00000, s73
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v6, 0x40c00000, s76
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v3, 0x40c00000, s0
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x7fff, v1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v21
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v5, 0x40c00000, s74
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v5, 0x40c00000, s5
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s0, v3
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v8, v6, 16, 1
-; GFX11-TRUE16-NEXT:    s_bfe_u32 s58, s0, 0x10010
+; GFX11-TRUE16-NEXT:    s_bfe_u32 s5, s0, 0x10010
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v20, 16, v4
-; GFX11-TRUE16-NEXT:    s_add_i32 s58, s58, s0
+; GFX11-TRUE16-NEXT:    s_add_i32 s41, s5, s0
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s5, s40, 16
+; GFX11-TRUE16-NEXT:    s_addk_i32 s41, 0x7fff
 ; GFX11-TRUE16-NEXT:    s_bitset1_b32 s0, 22
-; GFX11-TRUE16-NEXT:    s_addk_i32 s58, 0x7fff
-; GFX11-TRUE16-NEXT:    s_and_b32 s59, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s0, s0, s58
-; GFX11-TRUE16-NEXT:    s_and_b32 s58, s3, 0xffff0000
+; GFX11-TRUE16-NEXT:    s_and_b32 s40, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s0, s0, s41
+; GFX11-TRUE16-NEXT:    s_and_b32 s40, s3, 0xffff0000
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v1
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v9, 0x40c00000, s58
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v9, 0x40c00000, s40
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v1, v5, 16, 1
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, v8, v6
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v23
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s58, v9
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s40, v9
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, v1, v5
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 0x7fff, v4
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v7, 16, v3
-; GFX11-TRUE16-NEXT:    s_bfe_u32 s59, s58, 0x10010
+; GFX11-TRUE16-NEXT:    s_bfe_u32 s41, s40, 0x10010
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v6
-; GFX11-TRUE16-NEXT:    s_add_i32 s59, s59, s58
-; GFX11-TRUE16-NEXT:    s_bitset1_b32 s58, 22
-; GFX11-TRUE16-NEXT:    s_addk_i32 s59, 0x7fff
-; GFX11-TRUE16-NEXT:    s_and_b32 s62, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s58, s58, s59
+; GFX11-TRUE16-NEXT:    s_add_i32 s41, s41, s40
+; GFX11-TRUE16-NEXT:    s_bitset1_b32 s40, 22
+; GFX11-TRUE16-NEXT:    s_addk_i32 s41, 0x7fff
+; GFX11-TRUE16-NEXT:    s_and_b32 s42, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s40, s40, s41
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x7fff, v10
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s3
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v5
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v9, 0x40c00000, s72
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s58, 16
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v9, 0x40c00000, s75
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s40, 16
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s3, v10
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v8, v9, 16, 1
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v6, 0x40c00000, s61
-; GFX11-TRUE16-NEXT:    s_bfe_u32 s59, s3, 0x10010
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v6, 0x40c00000, s74
+; GFX11-TRUE16-NEXT:    s_bfe_u32 s41, s3, 0x10010
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v4
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT:    s_add_i32 s59, s59, s3
+; GFX11-TRUE16-NEXT:    s_add_i32 s41, s41, s3
 ; GFX11-TRUE16-NEXT:    s_bitset1_b32 s3, 22
-; GFX11-TRUE16-NEXT:    s_addk_i32 s59, 0x7fff
+; GFX11-TRUE16-NEXT:    s_addk_i32 s41, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, v8, v9
-; GFX11-TRUE16-NEXT:    s_and_b32 s58, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s3, s3, s59
-; GFX11-TRUE16-NEXT:    s_and_b32 s58, s2, 0xffff0000
+; GFX11-TRUE16-NEXT:    s_and_b32 s40, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s3, s3, s41
+; GFX11-TRUE16-NEXT:    s_and_b32 s40, s2, 0xffff0000
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v3
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v3, 0x40c00000, s58
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v3, 0x40c00000, s40
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 0x7fff, v5
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v9
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v6, 16, 1
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s58, v3
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s40, v3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v6
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT:    s_bfe_u32 s59, s58, 0x10010
+; GFX11-TRUE16-NEXT:    s_bfe_u32 s41, s40, 0x10010
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, v4, v6
-; GFX11-TRUE16-NEXT:    s_add_i32 s59, s59, s58
-; GFX11-TRUE16-NEXT:    s_bitset1_b32 s58, 22
-; GFX11-TRUE16-NEXT:    s_addk_i32 s59, 0x7fff
-; GFX11-TRUE16-NEXT:    s_and_b32 s61, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s58, s58, s59
+; GFX11-TRUE16-NEXT:    s_add_i32 s41, s41, s40
+; GFX11-TRUE16-NEXT:    s_bitset1_b32 s40, 22
+; GFX11-TRUE16-NEXT:    s_addk_i32 s41, 0x7fff
+; GFX11-TRUE16-NEXT:    s_and_b32 s42, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s40, s40, s41
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 0x7fff, v4
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v3, 0x40c00000, s2
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v8, 0x40c00000, s60
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v8, 0x40c00000, s73
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v5
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v5, 0x40c00000, s12
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s2, v3
@@ -175943,214 +175833,210 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v8, 16, 1
 ; GFX11-TRUE16-NEXT:    s_bfe_u32 s12, s2, 0x10010
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
-; GFX11-TRUE16-NEXT:    s_add_i32 s59, s12, s2
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s12, s58, 16
-; GFX11-TRUE16-NEXT:    s_addk_i32 s59, 0x7fff
+; GFX11-TRUE16-NEXT:    s_add_i32 s41, s12, s2
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s12, s40, 16
+; GFX11-TRUE16-NEXT:    s_addk_i32 s41, 0x7fff
 ; GFX11-TRUE16-NEXT:    s_bitset1_b32 s2, 22
-; GFX11-TRUE16-NEXT:    s_and_b32 s58, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s2, s2, s59
-; GFX11-TRUE16-NEXT:    s_and_b32 s58, s17, 0xffff0000
+; GFX11-TRUE16-NEXT:    s_and_b32 s40, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s2, s2, s41
+; GFX11-TRUE16-NEXT:    s_and_b32 s40, s17, 0xffff0000
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v27
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, v6, v8
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v24, 16, v7
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v7, 0x40c00000, s58
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v7, 0x40c00000, s40
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v9, 16, v10
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 0x7fff, v6
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v8
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s58, v7
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s40, v7
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v5, 16, 1
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s4, s0, s4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v4
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc_lo
-; GFX11-TRUE16-NEXT:    s_bfe_u32 s59, s58, 0x10010
+; GFX11-TRUE16-NEXT:    s_bfe_u32 s41, s40, 0x10010
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT:    s_add_i32 s59, s59, s58
-; GFX11-TRUE16-NEXT:    s_bitset1_b32 s58, 22
-; GFX11-TRUE16-NEXT:    s_addk_i32 s59, 0x7fff
+; GFX11-TRUE16-NEXT:    s_add_i32 s41, s41, s40
+; GFX11-TRUE16-NEXT:    s_bitset1_b32 s40, 22
+; GFX11-TRUE16-NEXT:    s_addk_i32 s41, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, v11, v5
-; GFX11-TRUE16-NEXT:    s_and_b32 s60, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s58, s58, s59
+; GFX11-TRUE16-NEXT:    s_and_b32 s42, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s40, s40, s41
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s17, s17, 16
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v8, 0x40c00000, s17
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 0x7fff, v11
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v5
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v7, 0x40c00000, s57
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v7, 0x40c00000, s72
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v6
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s17, v8
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v6, 0x40c00000, s56
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v6, 0x40c00000, s63
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v10, v11, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v7, 16, 1
-; GFX11-TRUE16-NEXT:    s_bfe_u32 s56, s17, 0x10010
+; GFX11-TRUE16-NEXT:    s_bfe_u32 s41, s17, 0x10010
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v6, 16, 1
-; GFX11-TRUE16-NEXT:    s_add_i32 s56, s56, s17
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s58, s58, 16
-; GFX11-TRUE16-NEXT:    s_addk_i32 s56, 0x7fff
+; GFX11-TRUE16-NEXT:    s_add_i32 s41, s41, s17
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s40, 16
+; GFX11-TRUE16-NEXT:    s_addk_i32 s41, 0x7fff
 ; GFX11-TRUE16-NEXT:    s_bitset1_b32 s17, 22
-; GFX11-TRUE16-NEXT:    s_and_b32 s57, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s17, s17, s56
-; GFX11-TRUE16-NEXT:    s_and_b32 s56, s16, 0xffff0000
+; GFX11-TRUE16-NEXT:    s_and_b32 s40, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s17, s17, s41
+; GFX11-TRUE16-NEXT:    s_and_b32 s40, s16, 0xffff0000
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v5
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v26
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, v9, v7
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, v10, v6
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s56
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s40
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v25, 16, v5
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 0x7fff, v9
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v7
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s56, v10
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v7, 0x40c00000, s47
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s40, v10
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s17, s17, 16
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v5, v5, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8
-; GFX11-TRUE16-NEXT:    s_bfe_u32 s47, s56, 0x10010
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT:    s_add_i32 s47, s47, s56
-; GFX11-TRUE16-NEXT:    s_bitset1_b32 s56, 22
-; GFX11-TRUE16-NEXT:    s_addk_i32 s47, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 0x7fff, v8
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v6
-; GFX11-TRUE16-NEXT:    s_and_b32 s57, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s47, s56, s47
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    s_bfe_u32 s41, s40, 0x10010
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    s_add_i32 s41, s41, s40
+; GFX11-TRUE16-NEXT:    s_bitset1_b32 s40, 22
+; GFX11-TRUE16-NEXT:    s_addk_i32 s41, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v7, 0x40c00000, s62
+; GFX11-TRUE16-NEXT:    s_and_b32 s42, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s40, s40, s41
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s16, s16, 16
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s16
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v7, 16, 1
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s42, s40, 16
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v5
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 24, v15
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v8, v11, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s16, v10
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v8, 0x40c00000, s46
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, v9, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v8, 0x40c00000, s57
+; GFX11-TRUE16-NEXT:    s_bfe_u32 s41, s16, 0x10010
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT:    s_bfe_u32 s46, s16, 0x10010
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v29
-; GFX11-TRUE16-NEXT:    s_add_i32 s56, s46, s16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s46, s47, 16
-; GFX11-TRUE16-NEXT:    s_addk_i32 s56, 0x7fff
+; GFX11-TRUE16-NEXT:    s_add_i32 s41, s41, s16
 ; GFX11-TRUE16-NEXT:    s_bitset1_b32 s16, 22
-; GFX11-TRUE16-NEXT:    s_and_b32 s47, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s16, s16, s56
-; GFX11-TRUE16-NEXT:    s_and_b32 s47, s19, 0xffff0000
+; GFX11-TRUE16-NEXT:    s_addk_i32 s41, 0x7fff
+; GFX11-TRUE16-NEXT:    s_and_b32 s40, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s16, s16, s41
+; GFX11-TRUE16-NEXT:    s_and_b32 s40, s19, 0xffff0000
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v12, 0x40c00000, s47
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v12, 0x40c00000, s40
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v6, 16, v9
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v9, 0x40c00000, s42
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s47, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v29
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v8, 16, 1
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s16, s16, 16
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s40, v12
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s16, s16, 16
-; GFX11-TRUE16-NEXT:    s_bfe_u32 s42, s47, 0x10010
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s43
-; GFX11-TRUE16-NEXT:    s_add_i32 s42, s42, s47
-; GFX11-TRUE16-NEXT:    s_bitset1_b32 s47, 22
-; GFX11-TRUE16-NEXT:    s_addk_i32 s42, 0x7fff
-; GFX11-TRUE16-NEXT:    s_and_b32 s43, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s42, s47, s42
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s19, s19, 16
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v6, 16, v9
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v9, 0x40c00000, s47
+; GFX11-TRUE16-NEXT:    s_bfe_u32 s41, s40, 0x10010
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, v11, v8
+; GFX11-TRUE16-NEXT:    s_add_i32 s41, s41, s40
+; GFX11-TRUE16-NEXT:    s_bitset1_b32 s40, 22
+; GFX11-TRUE16-NEXT:    s_addk_i32 s41, 0x7fff
+; GFX11-TRUE16-NEXT:    s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s40, s40, s41
+; GFX11-TRUE16-NEXT:    s_lshl_b32 s19, s19, 16
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s56
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v12, 0x40c00000, s19
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 0x7fff, v11
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v8
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s42, 16
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 0x7fff, v11
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s19, v12
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s19, v12
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s40, 16
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v5
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v14
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX11-TRUE16-NEXT:    s_bfe_u32 s43, s19, 0x10010
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT:    s_add_i32 s43, s43, s19
+; GFX11-TRUE16-NEXT:    s_bfe_u32 s41, s19, 0x10010
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v9, 16, 1
+; GFX11-TRUE16-NEXT:    s_add_i32 s41, s41, s19
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, v11, v10
-; GFX11-TRUE16-NEXT:    s_addk_i32 s43, 0x7fff
+; GFX11-TRUE16-NEXT:    s_addk_i32 s41, 0x7fff
 ; GFX11-TRUE16-NEXT:    s_bitset1_b32 s19, 22
-; GFX11-TRUE16-NEXT:    s_and_b32 s42, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s19, s19, s43
-; GFX11-TRUE16-NEXT:    s_and_b32 s42, s18, 0xffff0000
+; GFX11-TRUE16-NEXT:    s_and_b32 s40, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s19, s19, s41
+; GFX11-TRUE16-NEXT:    s_and_b32 s40, s18, 0xffff0000
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v6
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, v7, v9
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 0x7fff, v8
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v10
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v8, 0x40c00000, s42
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v8, 0x40c00000, s40
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 0x7fff, v5
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v9
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s19, s19, 16
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v12, 0x40c00000, s15
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s40, v8
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v7, 0x40c00000, s41
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s41, v8
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v4
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v7, 0x40c00000, s46
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v12, 0x40c00000, s15
+; GFX11-TRUE16-NEXT:    s_bfe_u32 s41, s40, 0x10010
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 16, v6
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v7, 16, 1
-; GFX11-TRUE16-NEXT:    s_bfe_u32 s42, s41, 0x10010
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT:    s_add_i32 s42, s42, s41
-; GFX11-TRUE16-NEXT:    s_bitset1_b32 s41, 22
-; GFX11-TRUE16-NEXT:    s_addk_i32 s42, 0x7fff
+; GFX11-TRUE16-NEXT:    s_add_i32 s41, s41, s40
+; GFX11-TRUE16-NEXT:    s_bitset1_b32 s40, 22
+; GFX11-TRUE16-NEXT:    s_addk_i32 s41, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v11, 0x40c00000, s28
-; GFX11-TRUE16-NEXT:    s_and_b32 s43, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s41, s41, s42
+; GFX11-TRUE16-NEXT:    s_and_b32 s46, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s40, s40, s41
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s18, s18, 16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v30
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v7, 16, 1
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v9, 0x40c00000, s18
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s46, s40, 16
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v30
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v31
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, v6, v7
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v17, v28, 16, v8
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s18, v9
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, v6, v7
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v17, v28, 16, v8
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v16, v5, 16, v10
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 0x7fff, v6
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v7
 ; GFX11-TRUE16-NEXT:    s_bfe_u32 s28, s18, 0x10010
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s14
-; GFX11-TRUE16-NEXT:    s_add_i32 s42, s28, s18
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s28, s41, 16
-; GFX11-TRUE16-NEXT:    s_addk_i32 s42, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT:    s_add_i32 s28, s28, s18
 ; GFX11-TRUE16-NEXT:    s_bitset1_b32 s18, 22
-; GFX11-TRUE16-NEXT:    s_and_b32 s41, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s18, s18, s42
-; GFX11-TRUE16-NEXT:    s_and_b32 s41, s21, 0xffff0000
+; GFX11-TRUE16-NEXT:    s_addk_i32 s28, 0x7fff
+; GFX11-TRUE16-NEXT:    s_and_b32 s40, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s18, s18, s28
+; GFX11-TRUE16-NEXT:    s_and_b32 s28, s21, 0xffff0000
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 0x7fff, v6
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v9, 0x40c00000, s28
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s14
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v9, 0x40c00000, s41
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v11, 16, 1
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s18, s18, 16
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s14, v9
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v11
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc_lo
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s14, v9
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, v6, v11
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v8, v10, 16, 1
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v12, 16, 1
 ; GFX11-TRUE16-NEXT:    s_bfe_u32 s15, s14, 0x10010
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v5
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v8, v10, 16, 1
 ; GFX11-TRUE16-NEXT:    s_add_i32 s15, s15, s14
 ; GFX11-TRUE16-NEXT:    s_bitset1_b32 s14, 22
 ; GFX11-TRUE16-NEXT:    s_addk_i32 s15, 0x7fff
-; GFX11-TRUE16-NEXT:    s_and_b32 s41, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s28, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cselect_b32 s14, s14, s15
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s15, s21, 16
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 0x7fff, v6
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v13, 0x40c00000, s15
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s14, 16
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, v9, v12
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 24, v17
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v12, 16, 1
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s14, 16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v5
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s15, v13
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, v8, v10
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, v9, v12
 ; GFX11-TRUE16-NEXT:    s_bfe_u32 s21, s15, 0x10010
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v6
 ; GFX11-TRUE16-NEXT:    s_add_i32 s21, s21, s15
@@ -176163,20 +176049,20 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v10
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v9, 0x40c00000, s15
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 0x7fff, v5
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v12
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s21, s14, 16
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 16, v16
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v86, 8, v4
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s28, s0, s5
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v7, 0x40c00000, s11
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s11, v9
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v3
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s40, s2, s12
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX11-TRUE16-NEXT:    s_bfe_u32 s15, s11, 0x10010
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v9, 0x40c00000, s9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v33
 ; GFX11-TRUE16-NEXT:    s_add_i32 s15, s15, s11
 ; GFX11-TRUE16-NEXT:    s_bitset1_b32 s11, 22
 ; GFX11-TRUE16-NEXT:    s_addk_i32 s15, 0x7fff
@@ -176186,35 +176072,35 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v5
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s14
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v5, v7, 16, 1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v33
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s11, s11, 16
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v9, 0x40c00000, s9
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v19, v32, 16, v8
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v34
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s9, v10
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s14, v10
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, v5, v7
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v19, v32, 16, v8
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v8, v9, 16, 1
-; GFX11-TRUE16-NEXT:    s_bfe_u32 s14, s9, 0x10010
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v18, v6, 16, v11
-; GFX11-TRUE16-NEXT:    s_add_i32 s14, s14, s9
-; GFX11-TRUE16-NEXT:    s_bitset1_b32 s9, 22
-; GFX11-TRUE16-NEXT:    s_addk_i32 s14, 0x7fff
+; GFX11-TRUE16-NEXT:    s_bfe_u32 s9, s14, 0x10010
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT:    s_add_i32 s15, s9, s14
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s9, s11, 16
+; GFX11-TRUE16-NEXT:    s_addk_i32 s15, 0x7fff
+; GFX11-TRUE16-NEXT:    s_bitset1_b32 s14, 22
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v7
-; GFX11-TRUE16-NEXT:    s_and_b32 s15, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s11, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s9, s9, s14
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s11, s14, s15
 ; GFX11-TRUE16-NEXT:    s_and_b32 s14, s23, 0xffff0000
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, v8, v9
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s20, s9, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s20, s11, 16
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v6, 0x40c00000, s14
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 0x7fff, v8
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v9
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s44
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s45
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s14, v6
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v11, 0x40c00000, s45
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v11, 0x40c00000, s44
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v5
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
@@ -176223,14 +176109,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    s_add_i32 s15, s15, s14
 ; GFX11-TRUE16-NEXT:    s_bitset1_b32 s14, 22
 ; GFX11-TRUE16-NEXT:    s_addk_i32 s15, 0x7fff
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s9, s14, s15
+; GFX11-TRUE16-NEXT:    s_and_b32 s11, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s11, s14, s15
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s14, s23, 16
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v11, 16, 1
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v6, 0x40c00000, s14
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v7
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, v8, v10
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s9, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s11, 16
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, v9, v11
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s14, v6
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
@@ -176242,12 +176128,12 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    s_add_i32 s15, s15, s14
 ; GFX11-TRUE16-NEXT:    s_bitset1_b32 s14, 22
 ; GFX11-TRUE16-NEXT:    s_addk_i32 s15, 0x7fff
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s11, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s9, s14, s15
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s11, s14, s15
 ; GFX11-TRUE16-NEXT:    s_and_b32 s14, s22, 0xffff0000
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s23, s9, 16
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s40
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s23, s11, 16
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s43
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v7, v9, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v7, 0x40c00000, s14
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
@@ -176264,15 +176150,15 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    s_add_i32 s15, s15, s14
 ; GFX11-TRUE16-NEXT:    s_bitset1_b32 s14, 22
 ; GFX11-TRUE16-NEXT:    s_addk_i32 s15, 0x7fff
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s9, s14, s15
+; GFX11-TRUE16-NEXT:    s_and_b32 s11, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s11, s14, s15
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s14, s22, 16
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v12, 0x40c00000, s14
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v5, v9, 16, 1
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v39
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, v11, v10
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s29, s9, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s11, s11, 16
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s14, v12
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, v5, v9
@@ -176283,34 +176169,34 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    s_add_i32 s13, s13, s14
 ; GFX11-TRUE16-NEXT:    s_bitset1_b32 s14, 22
 ; GFX11-TRUE16-NEXT:    s_addk_i32 s13, 0x7fff
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s9, s14, s13
-; GFX11-TRUE16-NEXT:    s_and_b32 s13, s25, 0xffff0000
+; GFX11-TRUE16-NEXT:    s_and_b32 s15, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s13, s14, s13
+; GFX11-TRUE16-NEXT:    s_and_b32 s14, s25, 0xffff0000
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v12, 0x40c00000, s13
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v12, 0x40c00000, s14
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 0x7fff, v5
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v9
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s10
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s10, v12
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s22, s9, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s22, s13, 16
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v8, 16, 1
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v6
-; GFX11-TRUE16-NEXT:    s_bfe_u32 s13, s10, 0x10010
+; GFX11-TRUE16-NEXT:    s_bfe_u32 s14, s10, 0x10010
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT:    s_add_i32 s13, s13, s10
+; GFX11-TRUE16-NEXT:    s_add_i32 s14, s14, s10
 ; GFX11-TRUE16-NEXT:    s_bitset1_b32 s10, 22
-; GFX11-TRUE16-NEXT:    s_addk_i32 s13, 0x7fff
+; GFX11-TRUE16-NEXT:    s_addk_i32 s14, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v5
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s9, s10, s13
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s25, 16
+; GFX11-TRUE16-NEXT:    s_and_b32 s13, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s10, s10, s14
+; GFX11-TRUE16-NEXT:    s_lshl_b32 s13, s25, 16
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v11, 0x40c00000, s8
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v5, 0x40c00000, s10
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v5, 0x40c00000, s13
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, v7, v8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s9, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s10, 16
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v10, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v8
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s8, v5
@@ -176318,19 +176204,19 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 0x7fff, v6
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, v7, v10
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v10
-; GFX11-TRUE16-NEXT:    s_bfe_u32 s10, s8, 0x10010
+; GFX11-TRUE16-NEXT:    s_bfe_u32 s13, s8, 0x10010
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v48, v11, 16, 1
-; GFX11-TRUE16-NEXT:    s_add_i32 s10, s10, s8
+; GFX11-TRUE16-NEXT:    s_add_i32 s13, s13, s8
 ; GFX11-TRUE16-NEXT:    s_bitset1_b32 s8, 22
-; GFX11-TRUE16-NEXT:    s_addk_i32 s10, 0x7fff
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_addk_i32 s13, 0x7fff
+; GFX11-TRUE16-NEXT:    s_and_b32 s10, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s8, s8, s10
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s24, 0xffff0000
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s8, s8, s13
+; GFX11-TRUE16-NEXT:    s_and_b32 s10, s24, 0xffff0000
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 0x7fff, v7
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s25, s8, 16
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v6, v12, vcc_lo
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v6, 0x40c00000, s9
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v6, 0x40c00000, s10
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s7
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, v48, v11
@@ -176340,39 +176226,39 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v11
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v37
-; GFX11-TRUE16-NEXT:    s_bfe_u32 s9, s7, 0x10010
+; GFX11-TRUE16-NEXT:    s_bfe_u32 s10, s7, 0x10010
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v7
-; GFX11-TRUE16-NEXT:    s_add_i32 s9, s9, s7
+; GFX11-TRUE16-NEXT:    s_add_i32 s10, s10, s7
 ; GFX11-TRUE16-NEXT:    s_bitset1_b32 s7, 22
-; GFX11-TRUE16-NEXT:    s_addk_i32 s9, 0x7fff
+; GFX11-TRUE16-NEXT:    s_addk_i32 s10, 0x7fff
 ; GFX11-TRUE16-NEXT:    s_and_b32 s8, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s7, s7, s9
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s7, s7, s10
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s24, 16
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 0x7fff, v8
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v12, 0x40c00000, s8
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v11, 0x40c00000, s5
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v11, 0x40c00000, s4
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v8, v10, 16, 1
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s42, s7, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s7, 16
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s8, v12
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, v8, v10
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v12, v51, 16, 1
-; GFX11-TRUE16-NEXT:    s_bfe_u32 s5, s8, 0x10010
+; GFX11-TRUE16-NEXT:    s_bfe_u32 s4, s8, 0x10010
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v6
-; GFX11-TRUE16-NEXT:    s_add_i32 s5, s5, s8
+; GFX11-TRUE16-NEXT:    s_add_i32 s4, s4, s8
 ; GFX11-TRUE16-NEXT:    s_bitset1_b32 s8, 22
-; GFX11-TRUE16-NEXT:    s_addk_i32 s5, 0x7fff
+; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x7fff
 ; GFX11-TRUE16-NEXT:    s_and_b32 s6, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s5, s8, s5
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s4, s8, s4
 ; GFX11-TRUE16-NEXT:    s_and_b32 s6, s27, 0xffff0000
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 0x7fff, v8
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v52, 0x40c00000, s6
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v10
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, v12, v51
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s24, s5, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s24, s4, 16
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v52
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v11
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc_lo
@@ -176383,8 +176269,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    s_add_i32 s7, s7, s6
 ; GFX11-TRUE16-NEXT:    s_bitset1_b32 s6, 22
 ; GFX11-TRUE16-NEXT:    s_addk_i32 s7, 0x7fff
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s5, s6, s7
+; GFX11-TRUE16-NEXT:    s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s4, s6, s7
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s27, 16
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v49, 16, v7
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, v8, v11
@@ -176393,7 +176279,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s6
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s5, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s4, 16
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v66, v5, 16, v13
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v10
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc_lo
@@ -176408,56 +176294,52 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    s_bitset1_b32 s6, 22
 ; GFX11-TRUE16-NEXT:    s_addk_i32 s7, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s6, s6, s7
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s26, 0xffff0000
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s27, s6, 16
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v7, 0x40c00000, s5
+; GFX11-TRUE16-NEXT:    s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s4, s6, s7
+; GFX11-TRUE16-NEXT:    s_and_b32 s6, s26, 0xffff0000
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s27, s4, 16
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v7, 0x40c00000, s6
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v51
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v55, v48, 16, v8
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s10, s22, s29
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s5, s1, s63
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s8, v7
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s8, s22, s11
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s29, s1, s58
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v7
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v54, v6, 16, v10
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s7, s3, s62
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s15, s21, s72
-; GFX11-TRUE16-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s14, s20, s11
-; GFX11-TRUE16-NEXT:    s_add_i32 s9, s9, s8
-; GFX11-TRUE16-NEXT:    s_bitset1_b32 s8, 22
-; GFX11-TRUE16-NEXT:    s_addk_i32 s9, 0x7fff
-; GFX11-TRUE16-NEXT:    s_and_b32 s6, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s40, s8, s9
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s26, 16
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s6, s2, s12
-; GFX11-TRUE16-NEXT:    v_add_f32_e64 v5, 0x40c00000, s8
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s12, s18, s28
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s40, s40, 16
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s9, s17, s58
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s8, s16, s46
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s26, v5
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s41, s3, s59
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s43, s17, s60
+; GFX11-TRUE16-NEXT:    s_bfe_u32 s5, s6, 0x10010
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s42, s16, s42
+; GFX11-TRUE16-NEXT:    s_add_i32 s5, s5, s6
+; GFX11-TRUE16-NEXT:    s_bitset1_b32 s6, 22
+; GFX11-TRUE16-NEXT:    s_addk_i32 s5, 0x7fff
+; GFX11-TRUE16-NEXT:    s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s13, s6, s5
+; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s26, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s13, s13, 16
+; GFX11-TRUE16-NEXT:    v_add_f32_e64 v5, 0x40c00000, s4
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s7, s21, s62
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s6, s20, s9
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s9, s23, s63
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s45, s27, s73
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s12, v5
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[5:6], 24, v[54:55]
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[6:7], 24, v[66:67]
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[7:8], 24, v[70:71]
-; GFX11-TRUE16-NEXT:    s_bfe_u32 s28, s26, 0x10010
+; GFX11-TRUE16-NEXT:    s_bfe_u32 s11, s12, 0x10010
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[8:9], 24, v[18:19]
-; GFX11-TRUE16-NEXT:    s_add_i32 s28, s28, s26
-; GFX11-TRUE16-NEXT:    s_bitset1_b32 s26, 22
-; GFX11-TRUE16-NEXT:    s_addk_i32 s28, 0x7fff
-; GFX11-TRUE16-NEXT:    s_and_b32 s29, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s26, s26, s28
+; GFX11-TRUE16-NEXT:    s_add_i32 s11, s11, s12
+; GFX11-TRUE16-NEXT:    s_bitset1_b32 s12, 22
+; GFX11-TRUE16-NEXT:    s_addk_i32 s11, 0x7fff
+; GFX11-TRUE16-NEXT:    s_and_b32 s14, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s12, s12, s11
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[9:10], 24, v[16:17]
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[10:11], 24, v[14:15]
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s26, s26, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s26, s12, 16
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[3:4]
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[12:13], 24, v[1:2]
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s11, s23, s44
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s29, s25, s45
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s41, s27, s73
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s40, s26, s40
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s28, s24, s42
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s44, s26, s13
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 24, v55
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 8, v55
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v54
@@ -176474,107 +176356,116 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 8, v19
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v18
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 24, v17
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 16, v16
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 24, v15
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v14
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v14, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v86, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v3
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 8, v3
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v97, 24, v2
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v2
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v1
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v100, 8, v1
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s13, s19, s59
-; GFX11-TRUE16-NEXT:    s_lshr_b64 s[88:89], s[40:41], 24
-; GFX11-TRUE16-NEXT:    s_lshr_b64 s[94:95], s[28:29], 24
-; GFX11-TRUE16-NEXT:    s_lshr_b64 s[90:91], s[10:11], 24
-; GFX11-TRUE16-NEXT:    s_lshr_b64 s[92:93], s[14:15], 24
-; GFX11-TRUE16-NEXT:    s_lshr_b64 s[46:47], s[8:9], 24
-; GFX11-TRUE16-NEXT:    s_lshr_b64 s[56:57], s[6:7], 24
-; GFX11-TRUE16-NEXT:    s_lshr_b64 s[60:61], s[4:5], 24
-; GFX11-TRUE16-NEXT:    s_lshr_b64 s[30:31], s[12:13], 24
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s43, s41, 24
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s41, s41, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s47, s40, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s40, s40, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s29, 24
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s29, s29, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s28, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s28, s28, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s74, s11, 24
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s5, s19, s61
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s4, s18, s46
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s11, s25, s72
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT:    s_lshr_b64 s[92:93], s[44:45], 24
+; GFX11-TRUE16-NEXT:    s_lshr_b64 s[94:95], s[8:9], 24
+; GFX11-TRUE16-NEXT:    s_lshr_b64 s[30:31], s[6:7], 24
+; GFX11-TRUE16-NEXT:    s_lshr_b64 s[14:15], s[42:43], 24
+; GFX11-TRUE16-NEXT:    s_lshr_b64 s[46:47], s[40:41], 24
+; GFX11-TRUE16-NEXT:    s_lshr_b64 s[56:57], s[28:29], 24
+; GFX11-TRUE16-NEXT:    s_lshr_b64 s[36:37], s[10:11], 24
+; GFX11-TRUE16-NEXT:    s_lshr_b64 s[38:39], s[4:5], 24
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s13, s45, 24
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s15, s45, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s44, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s44, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s47, s11, 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s11, s11, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s10, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s10, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s10, s10, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s15, 24
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s15, s15, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s14, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s14, s14, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s13, 24
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s13, s13, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s12, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s12, s12, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s9, 24
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s74, s9, 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s9, s9, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s91, s8, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s8, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s8, s8, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s95, s7, 24
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s93, s7, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s99, s6, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s42, s6, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s102, s5, 24
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s103, s5, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s104, s4, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s34, s4, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s7, 24
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s7, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s6, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s6, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s5, 24
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s5, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s4, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s91, s4, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s93, s43, 24
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s43, s43, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s95, s42, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s42, s42, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s31, s41, 24
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s41, s41, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s99, s40, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s12, s40, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s102, s29, 24
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s103, s29, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s104, s28, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s34, s28, 8
 ; GFX11-TRUE16-NEXT:    s_branch .LBB91_5
 ; GFX11-TRUE16-NEXT:  .LBB91_3:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 0
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 0
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr74_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr34_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr104_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr60_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr56_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr103_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr102_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr42_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr12_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr99_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr56_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr46_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr14_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr86_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr13_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr67_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr68_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr98_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr53_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr12_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr6_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr39_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr97_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr53_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr85_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr66_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr41_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr67_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr11_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr52_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr84_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr83_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr65_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr40_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr66_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr49_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr82_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr65_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr51_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr81_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr80_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr64_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr15_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr8_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr50_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr80_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr48_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr71_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr55_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr14_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr37_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr49_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr70_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr38_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr96_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr70_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr69_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr68_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr55_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr87_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr35_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr100_lo16
@@ -176585,198 +176476,198 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr88_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr78_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr76_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 1
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 2
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 3
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 4
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 5
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 6
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 7
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 8
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 9
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 10
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 11
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s11, 12
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 13
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 14
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 15
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 17
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 18
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s11, 19
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 20
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 21
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 22
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 23
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 24
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 25
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s11, 26
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr11_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 27
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 28
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 29
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 30
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s10, 31
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s10, 0
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s10, 1
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s10, 2
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s10, 3
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s10, 4
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s10, 5
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s10, 6
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s10, 7
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 1
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 2
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 3
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 4
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 5
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 6
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 7
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 8
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 9
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 10
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 11
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s5, 12
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 13
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 14
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 15
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 17
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 18
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s5, 19
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 20
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 21
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 22
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 23
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 24
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 25
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s5, 26
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr5_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 27
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 28
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 29
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 30
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v43, s4, 31
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s4, 0
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s4, 1
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s4, 2
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s4, 3
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s4, 4
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s4, 5
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s4, 6
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s4, 7
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr4_lo16
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s74, 8
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s75, 9
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr74_lo16
 ; GFX11-TRUE16-NEXT:    s_branch .LBB91_2
 ; GFX11-TRUE16-NEXT:  .LBB91_4:
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, s90
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, s92
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, s94
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s90, v43, 25
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s92, v43, 18
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s94, v42, 8
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, s30
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s91, v43, 26
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s93, v43, 19
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s95, v42, 9
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v43, 11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, s4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, s5
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s94, v43, 25
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v43, 18
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, s36
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, s38
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v81.l, s37
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v86.l, s39
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s95, v43, 26
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v43, 19
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s36, v42, 8
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s38, v43, 11
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, s40
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, s41
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.l, s87
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, s6
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.l, s7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, s42
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.l, s43
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.l, s96
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, s8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, s9
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, s14
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, s44
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, s45
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, s15
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.l, s58
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, s59
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, s40
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, s44
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, s45
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, s8
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, s58
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, s59
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, s9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.l, s60
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, s61
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, s10
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, s62
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, s63
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, s41
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, s11
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, s72
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, s73
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, s97
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, s28
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, s29
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, s98
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, s68
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.l, s69
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, s69
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.l, s70
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, s35
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, s54
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, s70
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, s49
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, s36
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, s39
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v69.l, s71
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v70.l, s80
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v67.l, s37
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v68.l, s55
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v80.l, s81
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, s82
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v71.l, s50
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, s64
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, s55
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, s71
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, s48
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v69.l, s80
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v70.l, s81
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v67.l, s50
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v68.l, s64
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v80.l, s82
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, s49
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v71.l, s51
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, s65
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v82.l, s83
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, s84
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v81.l, s38
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, s65
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, s66
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v84.l, s85
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, s10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v83.l, s51
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, s66
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v87.l, s11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v96.l, s12
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v85.l, s52
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v86.l, s48
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v99.l, s13
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, s4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v83.l, s52
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, s67
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v87.l, s5
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v96.l, s6
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v85.l, s53
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v99.l, s7
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v100.l, s86
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v97.l, s53
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v98.l, s67
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v97.l, s54
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v98.l, s68
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, s74
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, s76
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, s78
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, s88
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s63, v43, 0
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s62, v43, 2
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s58, v43, 7
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s59, v43, 14
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s72, v43, 21
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s44, v43, 28
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s45, v42, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, s90
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, s92
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s58, v43, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s59, v43, 2
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s60, v43, 7
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s61, v43, 14
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s62, v43, 21
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s63, v43, 28
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s72, v42, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s73, v42, 6
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s43, v42, 7
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s41, v42, 5
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s47, v42, 4
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s40, v42, 3
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s57, v42, 2
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s29, v42, 0
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s61, v43, 31
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s28, v43, 30
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s13, v42, 7
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s15, v42, 5
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s45, v42, 4
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s44, v42, 3
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s47, v42, 2
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s11, v42, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s57, v43, 31
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s10, v43, 30
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s74, v43, 29
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s11, v43, 27
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s9, v43, 27
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s75, v43, 24
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s10, v43, 23
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s8, v43, 23
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s76, v43, 22
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s15, v43, 20
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s77, v43, 17
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s14, v43, 16
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s78, v43, 15
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s13, v43, 13
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s79, v43, 10
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s12, v43, 9
-; GFX11-TRUE16-NEXT:    s_mov_b32 s88, s100
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s89, v43, 8
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s9, v43, 6
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s91, v43, 5
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s8, v43, 4
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s93, v43, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v43, 12
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s95, v43, 3
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s77, v43, 20
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s78, v43, 17
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s79, v43, 16
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s88, v43, 15
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s89, v43, 13
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s90, v43, 10
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s91, v43, 9
+; GFX11-TRUE16-NEXT:    s_mov_b32 s92, s100
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s93, v43, 8
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s43, v43, 6
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s95, v43, 5
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s37, v42, 9
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s42, v43, 4
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s41, v43, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s39, v43, 12
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v43, 3
 ; GFX11-TRUE16-NEXT:  .LBB91_5: ; %end
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s34
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s104
-; GFX11-TRUE16-NEXT:    s_mov_b32 s6, s60
+; GFX11-TRUE16-NEXT:    s_mov_b32 s6, s56
 ; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s4, 8
 ; GFX11-TRUE16-NEXT:    s_and_b32 s5, s5, 0xff
@@ -176784,7 +176675,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, s4
 ; GFX11-TRUE16-NEXT:    s_or_b32 s4, s5, s6
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s103
-; GFX11-TRUE16-NEXT:    s_mov_b32 s6, s63
+; GFX11-TRUE16-NEXT:    s_mov_b32 s6, s58
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s7, s102
 ; GFX11-TRUE16-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s5, 8
@@ -176798,18 +176689,18 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, s4
 ; GFX11-TRUE16-NEXT:    s_or_b32 s1, s1, s5
-; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s42
+; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s12
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s99
-; GFX11-TRUE16-NEXT:    s_mov_b32 s6, s56
+; GFX11-TRUE16-NEXT:    s_mov_b32 s6, s46
 ; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s4, 8
 ; GFX11-TRUE16-NEXT:    s_and_b32 s5, s5, 0xff
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s6, 8
 ; GFX11-TRUE16-NEXT:    s_or_b32 s2, s2, s4
 ; GFX11-TRUE16-NEXT:    s_or_b32 s4, s5, s6
-; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s93
-; GFX11-TRUE16-NEXT:    s_mov_b32 s6, s62
-; GFX11-TRUE16-NEXT:    s_mov_b32 s7, s95
+; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s41
+; GFX11-TRUE16-NEXT:    s_mov_b32 s6, s59
+; GFX11-TRUE16-NEXT:    s_mov_b32 s7, s31
 ; GFX11-TRUE16-NEXT:    s_and_b32 s3, s3, 0xff
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX11-TRUE16-NEXT:    s_and_b32 s6, s6, 0xff
@@ -176825,9 +176716,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s16
-; GFX11-TRUE16-NEXT:    s_mov_b32 s1, s8
-; GFX11-TRUE16-NEXT:    s_mov_b32 s2, s91
-; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s46
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, s42
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, s95
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s14
 ; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xff
@@ -176835,9 +176726,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-TRUE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s2, s17
-; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s9
-; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s58
-; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s89
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s43
+; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s60
+; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s93
 ; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s4, 0xff
@@ -176851,9 +176742,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-TRUE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s2, s18
-; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s12
-; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s79
-; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s30
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s91
+; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s90
+; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s38
 ; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s4, 0xff
@@ -176861,9 +176752,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-TRUE16-NEXT:    s_or_b32 s3, s4, s5
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s19
-; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s13
-; GFX11-TRUE16-NEXT:    s_mov_b32 s6, s59
-; GFX11-TRUE16-NEXT:    s_mov_b32 s7, s78
+; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s89
+; GFX11-TRUE16-NEXT:    s_mov_b32 s6, s61
+; GFX11-TRUE16-NEXT:    s_mov_b32 s7, s88
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX11-TRUE16-NEXT:    s_and_b32 s6, s6, 0xff
@@ -176879,9 +176770,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v112, s0 :: v_dual_mov_b32 v113, s1
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v114, s2 :: v_dual_mov_b32 v115, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s20
-; GFX11-TRUE16-NEXT:    s_mov_b32 s1, s14
-; GFX11-TRUE16-NEXT:    s_mov_b32 s2, s77
-; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s92
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, s79
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, s78
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s30
 ; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xff
@@ -176889,8 +176780,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-TRUE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s2, s21
-; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s15
-; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s72
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s77
+; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s62
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s76
 ; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 8
@@ -176905,9 +176796,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-TRUE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s2, s22
-; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s10
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s8
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s75
-; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s90
+; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s94
 ; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s4, 0xff
@@ -176915,8 +176806,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-TRUE16-NEXT:    s_or_b32 s3, s4, s5
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s23
-; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s11
-; GFX11-TRUE16-NEXT:    s_mov_b32 s6, s44
+; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s9
+; GFX11-TRUE16-NEXT:    s_mov_b32 s6, s63
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s7, s74
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s5, 8
@@ -176936,9 +176827,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s24
-; GFX11-TRUE16-NEXT:    s_mov_b32 s1, s28
-; GFX11-TRUE16-NEXT:    s_mov_b32 s2, s61
-; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s94
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, s10
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, s57
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s36
 ; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xff
@@ -176946,9 +176837,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-TRUE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s2, s25
-; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s29
-; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s45
-; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s57
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s11
+; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s72
+; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s47
 ; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s4, 0xff
@@ -176962,9 +176853,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-TRUE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s2, s26
-; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s40
-; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s47
-; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s88
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s44
+; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s45
+; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s92
 ; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s4, 0xff
@@ -176972,13 +176863,13 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-TRUE16-NEXT:    s_or_b32 s3, s4, s5
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s27
-; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s41
+; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s15
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s6, s73
 ; GFX11-TRUE16-NEXT:    s_or_b32 s4, s4, s5
 ; GFX11-TRUE16-NEXT:    s_and_b32 s5, s6, 0xff
-; GFX11-TRUE16-NEXT:    s_mov_b32 s6, s43
+; GFX11-TRUE16-NEXT:    s_mov_b32 s6, s13
 ; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s6, 8
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 16
@@ -177219,20 +177110,20 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s59, v8
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s35, 3
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v41, s99, 3
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s8, v9
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s6, v11
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s56, v9
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s57, v10
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s46, v11
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s36, 4
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v41, s100, 4
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s7, v12
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s4, v13
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s5, v14
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s47, v12
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s44, v13
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s45, v14
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s37, 5
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v41, s101, 5
 ; GFX11-FAKE16-NEXT:    s_mov_b32 vcc_hi, 0
-; GFX11-FAKE16-NEXT:    s_and_b32 s10, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
+; GFX11-FAKE16-NEXT:    s_and_b32 s4, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s38, 6
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v41, s102, 6
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s39, 7
@@ -177264,511 +177155,522 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s87, 31
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB91_3
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s27, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[42:43], s[22:23], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 13
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s27, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[46:47], s[16:17], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s1, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s99, s1, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 12
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s27, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s44, s0, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s27, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[12:13], s[26:27], 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 15
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s27, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s99, s2, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s100, s2, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s101, s1, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 14
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s27, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s11, s1, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s102, s1, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s103, s0, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s26, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s104, s0, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s84, s5, 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 14
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s26, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s40, s5, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s11, s5, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s87, s4, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 15
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s26, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s86, s4, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s48, s7, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s98, s7, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s25, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s83, s7, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s85, s6, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s69, s9, 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 17
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s25, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s97, s9, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s80, s9, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s82, s8, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 11
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s81, s8, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s65, s59, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s15, s59, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 18
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s24, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s68, s59, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s85, s45, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s45, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 17
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s26, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s5, s45, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s87, s44, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s86, s44, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 18
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s25, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s81, s47, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s98, s47, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s84, s47, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 19
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s25, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s48, s46, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s70, s57, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s97, s57, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 13
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s25, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s80, s57, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s83, s56, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s82, s56, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 20
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s24, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s66, s59, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s9, s59, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s69, s59, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 21
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s24, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s71, s58, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s70, s58, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 19
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s24, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s54, s61, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s14, s61, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s39, s61, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 20
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s23, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s67, s60, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s66, s60, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s38, s63, 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 21
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s23, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s39, s58, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s55, s61, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 22
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s23, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s8, s61, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s65, s61, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s68, s60, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 23
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s23, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s67, s60, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s51, s63, 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s96, s63, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s53, s63, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s64, s62, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 10
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s23, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s55, s62, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 12
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s23, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s54, s63, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s38, s62, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s64, s62, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s22, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s36, s73, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s13, s73, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 22
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s22, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s7, s73, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s50, s73, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s52, s72, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s51, s72, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 23
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s22, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 25
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s22, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s53, s72, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s52, s72, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s34, s29, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s12, s29, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 26
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s21, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s6, s29, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s35, s29, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s21, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s49, s28, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s37, s28, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[100:101], s[26:27], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 25
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s21, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[102:103], s[24:25], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[56:57], s[0:1], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[74:75], s[4:5], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 9
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s21, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[76:77], s[6:7], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[78:79], s[8:9], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s37, s28, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 27
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s21, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s49, s28, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[14:15], s[16:17], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[40:41], s[2:3], 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 11
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s21, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[42:43], s[0:1], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[74:75], s[44:45], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[76:77], s[46:47], 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 28
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s20, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[78:79], s[56:57], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[88:89], s[58:59], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 26
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s20, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[90:91], s[60:61], 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 29
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s20, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[92:93], s[62:63], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[94:95], s[72:73], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 27
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s20, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[30:31], s[28:29], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 28
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s19, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 30
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s19, 24
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 29
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s19, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s19, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 31
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s19, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 10
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s19, 8
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 30
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s18, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 31
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s18, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s4, 0
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s18, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s4, 1
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s18, 8
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s10, 0
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s17, 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s10, 1
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s17, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s4, 2
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s17, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s4, 3
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s17, 16
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 7
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s17, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s10, 2
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s16, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 9
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s17, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s4, 4
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s16, 16
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s10, 3
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s16, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s10, 4
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s3, 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s4, 5
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s16, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s4, 6
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s3, 24
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s10, 5
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s3, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 6
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s3, 8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s10, 7
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s42, 4
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s10, 6
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s2, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s43, 5
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[42:43], s[20:21], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s10, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s1, 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s42, 2
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s10, 9
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s6, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s43, 3
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[42:43], s[18:19], 24
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s46, 0
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s47, 1
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[46:47], s[2:3], 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s4, 7
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s3, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s3, 8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s4, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s46, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s12, 6
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s13, 7
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[12:13], s[24:25], 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s12, 4
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s13, 5
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[12:13], s[22:23], 24
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s12, 2
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s13, 3
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[12:13], s[20:21], 24
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s12, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s13, 1
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[12:13], s[18:19], 24
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, vcc_hi
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB91_4
 ; GFX11-FAKE16-NEXT:  .LBB91_2: ; %cmp.true
-; GFX11-FAKE16-NEXT:    s_and_b32 s10, s29, 0xffff0000
-; GFX11-FAKE16-NEXT:    s_and_b32 s13, s9, 0xffff0000
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v1, 0x40c00000, s10
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s9, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s1, 0xffff0000
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s11, s29, 16
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v6, 0x40c00000, s9
-; GFX11-FAKE16-NEXT:    s_and_b32 s76, s28, 0xffff0000
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s77, s28, 16
+; GFX11-FAKE16-NEXT:    s_and_b32 s4, s29, 0xffff0000
+; GFX11-FAKE16-NEXT:    s_and_b32 s14, s47, 0xffff0000
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-FAKE16-NEXT:    s_and_b32 s4, s1, 0xffff0000
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s15, s47, 16
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v6, 0x40c00000, s4
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s29, 16
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX11-FAKE16-NEXT:    s_and_b32 s28, s58, 0xffff0000
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s29, s58, 16
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s58, v6
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v2, 0x40c00000, s11
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, v3, v1
-; GFX11-FAKE16-NEXT:    s_and_b32 s12, s6, 0xffff0000
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s6, 16
-; GFX11-FAKE16-NEXT:    s_bfe_u32 s6, s58, 0x10010
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v2, 0x40c00000, s6
+; GFX11-FAKE16-NEXT:    s_and_b32 s8, s45, 0xffff0000
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s47, v6
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s45, 16
+; GFX11-FAKE16-NEXT:    s_and_b32 s78, s28, 0xffff0000
+; GFX11-FAKE16-NEXT:    s_bfe_u32 s6, s47, 0x10010
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s79, s28, 16
+; GFX11-FAKE16-NEXT:    s_add_i32 s45, s6, s47
+; GFX11-FAKE16-NEXT:    s_and_b32 s5, s73, 0xffff0000
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s77, s73, 16
+; GFX11-FAKE16-NEXT:    s_and_b32 s75, s72, 0xffff0000
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s76, s72, 16
+; GFX11-FAKE16-NEXT:    s_and_b32 s11, s63, 0xffff0000
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s74, s63, 16
+; GFX11-FAKE16-NEXT:    s_and_b32 s72, s62, 0xffff0000
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s73, s62, 16
+; GFX11-FAKE16-NEXT:    s_and_b32 s63, s61, 0xffff0000
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s62, s61, 16
+; GFX11-FAKE16-NEXT:    s_and_b32 s61, s60, 0xffff0000
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s60, s60, 16
 ; GFX11-FAKE16-NEXT:    s_and_b32 s41, s59, 0xffff0000
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s40, s59, 16
-; GFX11-FAKE16-NEXT:    s_add_i32 s59, s6, s58
-; GFX11-FAKE16-NEXT:    s_and_b32 s74, s73, 0xffff0000
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s75, s73, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s73, s72, 0xffff0000
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s72, s72, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s11, s63, 0xffff0000
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s63, s63, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s56, s62, 0xffff0000
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s57, s62, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s47, s61, 0xffff0000
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s45, s61, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s44, s60, 0xffff0000
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s46, s60, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s42, s8, 0xffff0000
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s43, s8, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s14, s7, 0xffff0000
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s15, s7, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s5, 0xffff0000
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s5, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s4, 0xffff0000
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s4, 16
-; GFX11-FAKE16-NEXT:    s_addk_i32 s59, 0x7fff
-; GFX11-FAKE16-NEXT:    s_bitset1_b32 s58, 22
+; GFX11-FAKE16-NEXT:    s_and_b32 s28, s58, 0xffff0000
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s29, s58, 16
+; GFX11-FAKE16-NEXT:    s_and_b32 s13, s57, 0xffff0000
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s57, 16
+; GFX11-FAKE16-NEXT:    s_and_b32 s42, s56, 0xffff0000
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s43, s56, 16
+; GFX11-FAKE16-NEXT:    s_and_b32 s12, s46, 0xffff0000
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s46, 16
+; GFX11-FAKE16-NEXT:    s_and_b32 s4, s44, 0xffff0000
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s44, 16
+; GFX11-FAKE16-NEXT:    s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT:    s_bitset1_b32 s47, 22
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v1
 ; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 s44, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s4, s58, s59
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s44, s47, s45
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, v4, v2
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s61, s4, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s58, s44, 16
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_add_f32_e64 v3, 0x40c00000, s1
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v2
 ; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 0x7fff, v4
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v6, 0x40c00000, s76
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v6, 0x40c00000, s78
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s1, v3
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v7, 0x40c00000, s77
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v1
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v7, 0x40c00000, s79
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v1
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-FAKE16-NEXT:    s_bfe_u32 s58, s1, 0x10010
+; GFX11-FAKE16-NEXT:    s_bfe_u32 s45, s1, 0x10010
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v6, 16, 1
-; GFX11-FAKE16-NEXT:    s_add_i32 s58, s58, s1
+; GFX11-FAKE16-NEXT:    s_add_i32 s45, s45, s1
 ; GFX11-FAKE16-NEXT:    s_bitset1_b32 s1, 22
-; GFX11-FAKE16-NEXT:    s_addk_i32 s58, 0x7fff
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s1, s1, s58
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s0, 0xffff0000
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v2
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v2, 0x40c00000, s4
+; GFX11-FAKE16-NEXT:    s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT:    s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s1, s1, s45
+; GFX11-FAKE16-NEXT:    s_and_b32 s44, s0, 0xffff0000
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v2, 0x40c00000, s44
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v7, 16, 1
 ; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, v4, v6
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v7
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s44, v2
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-FAKE16-NEXT:    s_bfe_u32 s58, s4, 0x10010
+; GFX11-FAKE16-NEXT:    s_bfe_u32 s45, s44, 0x10010
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    s_add_i32 s58, s58, s4
-; GFX11-FAKE16-NEXT:    s_bitset1_b32 s4, 22
-; GFX11-FAKE16-NEXT:    s_addk_i32 s58, 0x7fff
-; GFX11-FAKE16-NEXT:    s_and_b32 s59, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT:    s_add_i32 s45, s45, s44
+; GFX11-FAKE16-NEXT:    s_bitset1_b32 s44, 22
+; GFX11-FAKE16-NEXT:    s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT:    s_and_b32 s46, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
 ; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, v5, v7
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v6
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s4, s4, s58
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s44, s44, s45
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v6, 0x40c00000, s75
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_add_f32_e64 v3, 0x40c00000, s0
 ; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x7fff, v1
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v22
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v5, 0x40c00000, s74
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v21
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v5, 0x40c00000, s5
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s0, v3
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v8, v6, 16, 1
-; GFX11-FAKE16-NEXT:    s_bfe_u32 s58, s0, 0x10010
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v2, v23, 16, v4
-; GFX11-FAKE16-NEXT:    s_add_i32 s58, s58, s0
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v8, 0x40c00000, s77
+; GFX11-FAKE16-NEXT:    s_bfe_u32 s5, s0, 0x10010
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v7, v22, 16, v4
+; GFX11-FAKE16-NEXT:    s_add_i32 s45, s5, s0
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s5, s44, 16
+; GFX11-FAKE16-NEXT:    s_addk_i32 s45, 0x7fff
 ; GFX11-FAKE16-NEXT:    s_bitset1_b32 s0, 22
-; GFX11-FAKE16-NEXT:    s_addk_i32 s58, 0x7fff
-; GFX11-FAKE16-NEXT:    s_and_b32 s59, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s0, s0, s58
-; GFX11-FAKE16-NEXT:    s_and_b32 s58, s3, 0xffff0000
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v1
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v9, 0x40c00000, s58
+; GFX11-FAKE16-NEXT:    s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s0, s0, s45
+; GFX11-FAKE16-NEXT:    s_and_b32 s44, s3, 0xffff0000
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v1
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v9, 0x40c00000, s44
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v8, 16, 1
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v1, v5, 16, 1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, v8, v6
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v24
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s58, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v23
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s44, v9
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, v1, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v1, v7, 16, v3
-; GFX11-FAKE16-NEXT:    s_bfe_u32 s59, s58, 0x10010
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v6
-; GFX11-FAKE16-NEXT:    s_add_i32 s59, s59, s58
-; GFX11-FAKE16-NEXT:    s_bitset1_b32 s58, 22
-; GFX11-FAKE16-NEXT:    s_addk_i32 s59, 0x7fff
-; GFX11-FAKE16-NEXT:    s_and_b32 s60, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s58, s58, s59
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, v6, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, v1, v5
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v6, v2, 16, v3
+; GFX11-FAKE16-NEXT:    s_bfe_u32 s45, s44, 0x10010
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v5
+; GFX11-FAKE16-NEXT:    s_add_i32 s45, s45, s44
+; GFX11-FAKE16-NEXT:    s_bitset1_b32 s44, 22
+; GFX11-FAKE16-NEXT:    s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT:    s_and_b32 s46, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s44, s44, s45
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x7fff, v10
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX11-FAKE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s3
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v5
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v9, 0x40c00000, s72
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s62, s58, 16
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s3, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v9, 0x40c00000, s76
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s59, s44, 16
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v4, 0x40c00000, s75
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s3, v10
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v8, v9, 16, 1
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v6, 0x40c00000, s73
-; GFX11-FAKE16-NEXT:    s_bfe_u32 s59, s3, 0x10010
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 24, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v2, v4, 16, 1
+; GFX11-FAKE16-NEXT:    s_bfe_u32 s45, s3, 0x10010
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT:    s_add_i32 s59, s59, s3
+; GFX11-FAKE16-NEXT:    s_add_i32 s45, s45, s3
 ; GFX11-FAKE16-NEXT:    s_bitset1_b32 s3, 22
-; GFX11-FAKE16-NEXT:    s_addk_i32 s59, 0x7fff
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, v8, v9
-; GFX11-FAKE16-NEXT:    s_and_b32 s58, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s3, s3, s59
-; GFX11-FAKE16-NEXT:    s_and_b32 s58, s2, 0xffff0000
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v3, 0x40c00000, s58
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT:    s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT:    s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s3, s3, s45
+; GFX11-FAKE16-NEXT:    s_and_b32 s44, s2, 0xffff0000
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v1
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v1, 0x40c00000, s44
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, v8, v9
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s44, v1
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v6, 16, 1
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s58, v3
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-FAKE16-NEXT:    s_bfe_u32 s59, s58, 0x10010
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, v4, v6
-; GFX11-FAKE16-NEXT:    s_add_i32 s59, s59, s58
-; GFX11-FAKE16-NEXT:    s_bitset1_b32 s58, 22
-; GFX11-FAKE16-NEXT:    s_addk_i32 s59, 0x7fff
-; GFX11-FAKE16-NEXT:    s_and_b32 s60, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s58, s58, s59
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v4
+; GFX11-FAKE16-NEXT:    s_bfe_u32 s45, s44, 0x10010
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v24
+; GFX11-FAKE16-NEXT:    s_add_i32 s45, s45, s44
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT:    s_bitset1_b32 s44, 22
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v1, 0x40c00000, s74
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v14, v25, 16, v5
+; GFX11-FAKE16-NEXT:    s_and_b32 s46, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s44, s44, s45
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-FAKE16-NEXT:    v_add_f32_e64 v8, 0x40c00000, s2
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v3, 0x40c00000, s63
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v5
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v5, 0x40c00000, s11
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v14
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s2, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v3, 0x40c00000, s11
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v25
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v3, 16, 1
 ; GFX11-FAKE16-NEXT:    s_bfe_u32 s11, s2, 0x10010
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-FAKE16-NEXT:    s_add_i32 s59, s11, s2
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s11, s58, 16
-; GFX11-FAKE16-NEXT:    s_addk_i32 s59, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    s_add_i32 s45, s11, s2
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s11, s44, 16
+; GFX11-FAKE16-NEXT:    s_addk_i32 s45, 0x7fff
 ; GFX11-FAKE16-NEXT:    s_bitset1_b32 s2, 22
-; GFX11-FAKE16-NEXT:    s_and_b32 s58, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s2, s2, s59
-; GFX11-FAKE16-NEXT:    s_and_b32 s58, s17, 0xffff0000
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v27
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, v6, v3
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v9, v26, 16, v7
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v7, 0x40c00000, s58
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v8, v4, 16, v10
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 0x7fff, v6
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v3
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s58, v7
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v11, v5, 16, 1
+; GFX11-FAKE16-NEXT:    s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s2, s2, s45
+; GFX11-FAKE16-NEXT:    s_and_b32 s44, s17, 0xffff0000
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v26
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v5, 0x40c00000, s44
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v10, v3, 16, 1
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v4, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    s_bfe_u32 s59, s58, 0x10010
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    s_add_i32 s59, s59, s58
-; GFX11-FAKE16-NEXT:    s_bitset1_b32 s58, 22
-; GFX11-FAKE16-NEXT:    s_addk_i32 s59, 0x7fff
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, v11, v5
-; GFX11-FAKE16-NEXT:    s_and_b32 s60, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s58, s58, s59
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s17, 16
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v13, v2, 16, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s44, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, v10, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 16, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    s_bfe_u32 s45, s44, 0x10010
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v6, 0x40c00000, s17
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 0x7fff, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v5
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v4, 0x40c00000, s57
+; GFX11-FAKE16-NEXT:    s_add_i32 s45, s45, s44
+; GFX11-FAKE16-NEXT:    s_bitset1_b32 s44, 22
+; GFX11-FAKE16-NEXT:    s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT:    s_and_b32 s46, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s44, s44, s45
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s17, s17, 16
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v2, 0x40c00000, s73
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v4, 0x40c00000, s17
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v1
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v1, 0x40c00000, s72
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s17, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s72, s44, 16
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT:    s_bfe_u32 s45, s17, 0x10010
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v3
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s17, v6
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v3, 0x40c00000, s56
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v10, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v4, 16, 1
-; GFX11-FAKE16-NEXT:    s_bfe_u32 s56, s17, 0x10010
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v10, v3, 16, 1
-; GFX11-FAKE16-NEXT:    s_add_i32 s56, s56, s17
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s58, s58, 16
-; GFX11-FAKE16-NEXT:    s_addk_i32 s56, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v27
+; GFX11-FAKE16-NEXT:    s_add_i32 s45, s45, s17
 ; GFX11-FAKE16-NEXT:    s_bitset1_b32 s17, 22
-; GFX11-FAKE16-NEXT:    s_and_b32 s57, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s17, s17, s56
-; GFX11-FAKE16-NEXT:    s_and_b32 s56, s16, 0xffff0000
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v28
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, v7, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, v10, v3
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s56
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v13, v29, 16, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 0x7fff, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s56, v10
+; GFX11-FAKE16-NEXT:    s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT:    s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v16, v28, 16, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v8, v1, 16, 1
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s17, s17, s45
+; GFX11-FAKE16-NEXT:    s_and_b32 s44, s16, 0xffff0000
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s17, s17, 16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v5, 0x40c00000, s47
-; GFX11-FAKE16-NEXT:    s_bfe_u32 s47, s56, 0x10010
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT:    s_add_i32 s47, s47, s56
-; GFX11-FAKE16-NEXT:    s_bitset1_b32 s56, 22
-; GFX11-FAKE16-NEXT:    s_addk_i32 s47, 0x7fff
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v5, 16, 1
-; GFX11-FAKE16-NEXT:    s_and_b32 s57, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s47, s56, s47
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v3, 0x40c00000, s63
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 24, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, v5, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v29
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, v8, v1
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v8, 0x40c00000, s44
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s44, v8
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    s_bfe_u32 s45, s44, 0x10010
+; GFX11-FAKE16-NEXT:    s_add_i32 s45, s45, s44
+; GFX11-FAKE16-NEXT:    s_bitset1_b32 s44, 22
+; GFX11-FAKE16-NEXT:    s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT:    s_and_b32 s46, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s44, s44, s45
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s16, s16, 16
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s16
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, v7, v5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 24, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v6, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s16, v10
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v6, 0x40c00000, s45
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v30
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v8, 0x40c00000, s16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s44, 16
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s16, v8
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v4, 0x40c00000, s62
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11-FAKE16-NEXT:    s_bfe_u32 s45, s16, 0x10010
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-FAKE16-NEXT:    s_add_i32 s56, s45, s16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s47, 16
-; GFX11-FAKE16-NEXT:    s_addk_i32 s56, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_add_i32 s45, s45, s16
 ; GFX11-FAKE16-NEXT:    s_bitset1_b32 s16, 22
-; GFX11-FAKE16-NEXT:    s_and_b32 s47, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s16, s16, s56
-; GFX11-FAKE16-NEXT:    s_and_b32 s47, s19, 0xffff0000
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v14, 0x40c00000, s47
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v12, v3, 16, v7
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v7, 0x40c00000, s44
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v11, v6, 16, 1
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s47, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v4, v10, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT:    s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s16, s16, s45
+; GFX11-FAKE16-NEXT:    s_and_b32 s44, s19, 0xffff0000
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s44
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v15, v1, 16, v5
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v4, 16, 1
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s16, s16, 16
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s46
-; GFX11-FAKE16-NEXT:    s_bfe_u32 s44, s47, 0x10010
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, v11, v6
-; GFX11-FAKE16-NEXT:    s_add_i32 s44, s44, s47
-; GFX11-FAKE16-NEXT:    s_bitset1_b32 s47, 22
-; GFX11-FAKE16-NEXT:    s_addk_i32 s44, 0x7fff
-; GFX11-FAKE16-NEXT:    s_and_b32 s46, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s44, s47, s44
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s44, v10
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, v9, v4
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v8, 0x40c00000, s60
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v4
+; GFX11-FAKE16-NEXT:    s_bfe_u32 s45, s44, 0x10010
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v5, 0x40c00000, s61
+; GFX11-FAKE16-NEXT:    s_add_i32 s45, s45, s44
+; GFX11-FAKE16-NEXT:    s_bitset1_b32 s44, 22
+; GFX11-FAKE16-NEXT:    s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT:    s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s44, s44, s45
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s19, s19, 16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 0x7fff, v11
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v14, 0x40c00000, s19
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v6
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v11, v10, 16, 1
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s59, s44, 16
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s19, v14
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 16, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v7, 16, 1
-; GFX11-FAKE16-NEXT:    s_bfe_u32 s46, s19, 0x10010
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, v11, v10
-; GFX11-FAKE16-NEXT:    s_add_i32 s46, s46, s19
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x7fff, v9
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s19
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v8, 16, 1
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s60, s44, 16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v1
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s19, v10
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v3, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, v9, v8
+; GFX11-FAKE16-NEXT:    s_bfe_u32 s45, s19, 0x10010
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 16, v2
+; GFX11-FAKE16-NEXT:    s_add_i32 s45, s45, s19
 ; GFX11-FAKE16-NEXT:    s_bitset1_b32 s19, 22
-; GFX11-FAKE16-NEXT:    s_addk_i32 s46, 0x7fff
+; GFX11-FAKE16-NEXT:    s_addk_i32 s45, 0x7fff
 ; GFX11-FAKE16-NEXT:    s_and_b32 s44, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s19, s19, s46
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s19, s19, s45
 ; GFX11-FAKE16-NEXT:    s_and_b32 s44, s18, 0xffff0000
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, v5, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 0x7fff, v6
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v10
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v6, 0x40c00000, s44
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, v3, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v4, 0x40c00000, s44
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v5
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s19, s19, 16
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v14, 0x40c00000, s29
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v5, 0x40c00000, s41
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s41, v6
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v12
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s29
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v3, 0x40c00000, s41
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s41, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s47, s17, s72
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v2, v3, 16, 1
 ; GFX11-FAKE16-NEXT:    s_bfe_u32 s44, s41, 0x10010
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-FAKE16-NEXT:    s_add_i32 s44, s44, s41
 ; GFX11-FAKE16-NEXT:    s_bitset1_b32 s41, 22
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s44, 0x7fff
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v11, 0x40c00000, s40
-; GFX11-FAKE16-NEXT:    s_and_b32 s46, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v9, 0x40c00000, s40
+; GFX11-FAKE16-NEXT:    s_and_b32 s45, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cselect_b32 s41, s41, s44
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s18, s18, 16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v32
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v7, 0x40c00000, s18
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v33
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, v4, v5
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v19, v31, 16, v6
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s18, v7
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v3, 16, v10
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x7fff, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v31
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v5, 0x40c00000, s18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v32
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, v2, v3
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v30, 16, v4
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s18, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v17, v1, 16, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x7fff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v3
 ; GFX11-FAKE16-NEXT:    s_bfe_u32 s40, s18, 0x10010
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s28
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v8, 0x40c00000, s28
 ; GFX11-FAKE16-NEXT:    s_add_i32 s44, s40, s18
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s40, s41, 16
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s44, 0x7fff
@@ -177776,105 +177678,105 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT:    s_and_b32 s41, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cselect_b32 s18, s18, s44
 ; GFX11-FAKE16-NEXT:    s_and_b32 s41, s21, 0xffff0000
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v7, 0x40c00000, s41
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v11, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v5, 0x40c00000, s41
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v2, v9, 16, 1
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s18, s18, 16
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v11
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s28, v7
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, v4, v11
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v10, 16, 1
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v14, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s28, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, v2, v9
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v8, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v10, 16, 1
 ; GFX11-FAKE16-NEXT:    s_bfe_u32 s29, s28, 0x10010
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v1
 ; GFX11-FAKE16-NEXT:    s_add_i32 s29, s29, s28
 ; GFX11-FAKE16-NEXT:    s_bitset1_b32 s28, 22
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s29, 0x7fff
 ; GFX11-FAKE16-NEXT:    s_and_b32 s41, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cselect_b32 s28, s28, s29
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s21, s21, 16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v15, 0x40c00000, s21
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v11, 0x40c00000, s21
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s61, s28, 16
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, v5, v10
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s44, s2, s11
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s21, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s60, s28, 16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, v7, v14
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 24, v19
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s21, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, v6, v10
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, v4, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x7fff, v1
 ; GFX11-FAKE16-NEXT:    s_bfe_u32 s29, s21, 0x10010
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v2
 ; GFX11-FAKE16-NEXT:    s_add_i32 s29, s29, s21
 ; GFX11-FAKE16-NEXT:    s_bitset1_b32 s21, 22
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s29, 0x7fff
 ; GFX11-FAKE16-NEXT:    s_and_b32 s28, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cselect_b32 s21, s21, s29
 ; GFX11-FAKE16-NEXT:    s_and_b32 s28, s20, 0xffff0000
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 0x7fff, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v10
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v7, 0x40c00000, s28
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x7fff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v5, 0x40c00000, s28
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v10
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s21, s21, 16
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 16, v18
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v5, 0x40c00000, s13
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s13, v7
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 16, v8
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s45, s3, s59
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s46, s16, s46
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v3, 0x40c00000, s13
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s13, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 24, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX11-FAKE16-NEXT:    s_bfe_u32 s28, s13, 0x10010
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v34
 ; GFX11-FAKE16-NEXT:    s_add_i32 s28, s28, s13
 ; GFX11-FAKE16-NEXT:    s_bitset1_b32 s13, 22
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s28, 0x7fff
 ; GFX11-FAKE16-NEXT:    s_and_b32 s29, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cselect_b32 s13, s13, s28
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s20, s20, 16
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s20
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v3, v5, 16, 1
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v7, 0x40c00000, s10
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v21, v34, 16, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v36
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s20, v10
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, v3, v5
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v7, 16, 1
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v20, v4, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v1
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v8, 0x40c00000, s20
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v1, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v5, 0x40c00000, s10
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v20, v33, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v35
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s20, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v19, v2, 16, v9
 ; GFX11-FAKE16-NEXT:    s_bfe_u32 s10, s20, 0x10010
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x7fff, v1
 ; GFX11-FAKE16-NEXT:    s_add_i32 s28, s10, s20
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s13, 16
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s28, 0x7fff
 ; GFX11-FAKE16-NEXT:    s_bitset1_b32 s20, 22
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s13, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
 ; GFX11-FAKE16-NEXT:    s_cselect_b32 s13, s20, s28
 ; GFX11-FAKE16-NEXT:    s_and_b32 s20, s23, 0xffff0000
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, v6, v7
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s42
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v8, 0x40c00000, s42
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v2, 0x40c00000, s20
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v9, 0x40c00000, s43
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s28, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v19
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v4, 0x40c00000, s20
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 0x7fff, v6
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v7
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v11, 0x40c00000, s43
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s28, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v20
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX11-FAKE16-NEXT:    s_bfe_u32 s20, s28, 0x10010
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v8, 16, 1
 ; GFX11-FAKE16-NEXT:    s_add_i32 s29, s20, s28
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s20, s13, 16
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s29, 0x7fff
@@ -177882,241 +177784,235 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT:    s_and_b32 s13, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cselect_b32 s13, s28, s29
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s23, s23, 16
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v11, 16, 1
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v4, 0x40c00000, s23
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, v6, v10
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s63, s13, 16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, v7, v11
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s23, v4
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v2, 0x40c00000, s23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, v4, v8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s62, s13, 16
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, v5, v9
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s23, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x7fff, v1
 ; GFX11-FAKE16-NEXT:    s_bfe_u32 s28, s23, 0x10010
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 0x400000, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v9
 ; GFX11-FAKE16-NEXT:    s_add_i32 s28, s28, s23
 ; GFX11-FAKE16-NEXT:    s_bitset1_b32 s23, 22
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s28, 0x7fff
 ; GFX11-FAKE16-NEXT:    s_and_b32 s13, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX11-FAKE16-NEXT:    s_cselect_b32 s13, s23, s28
 ; GFX11-FAKE16-NEXT:    s_and_b32 s23, s22, 0xffff0000
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s15
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v37
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc_lo
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v5, 0x40c00000, s23
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v7, 0x40c00000, s14
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v8, 0x40c00000, s15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v36
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v3, 0x40c00000, s23
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v5, 0x40c00000, s14
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s23, s13, 16
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v11, v10, 16, 1
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s14, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v14, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v71, v38, 16, v6
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v6, 0x40c00000, s12
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v8, 16, 1
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s14, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v71, v37, 16, v4
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v4, 0x40c00000, s12
 ; GFX11-FAKE16-NEXT:    s_bfe_u32 s15, s14, 0x10010
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v1
 ; GFX11-FAKE16-NEXT:    s_add_i32 s15, s15, s14
 ; GFX11-FAKE16-NEXT:    s_bitset1_b32 s14, 22
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s15, 0x7fff
 ; GFX11-FAKE16-NEXT:    s_and_b32 s13, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cselect_b32 s13, s14, s15
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s14, s22, 16
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v14, 0x40c00000, s14
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v3, v7, 16, 1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v39
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, v11, v10
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s28, s13, 16
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s14, v14
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, v3, v7
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v70, v4, 16, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 0x7fff, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s14
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v1, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v38
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, v9, v8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s13, s13, 16
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s14, v10
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, v1, v5
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v70, v2, 16, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x7fff, v9
 ; GFX11-FAKE16-NEXT:    s_bfe_u32 s12, s14, 0x10010
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v8
 ; GFX11-FAKE16-NEXT:    s_add_i32 s12, s12, s14
 ; GFX11-FAKE16-NEXT:    s_bitset1_b32 s14, 22
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s12, 0x7fff
-; GFX11-FAKE16-NEXT:    s_and_b32 s13, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT:    s_and_b32 s15, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cselect_b32 s12, s14, s12
-; GFX11-FAKE16-NEXT:    s_and_b32 s13, s25, 0xffff0000
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v14, 0x40c00000, s13
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v7
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s9, v14
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    s_and_b32 s14, s25, 0xffff0000
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s14
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v8, 0x40c00000, s9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s9, v10
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s22, s12, 16
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v6, 16, 1
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v4
-; GFX11-FAKE16-NEXT:    s_bfe_u32 s13, s9, 0x10010
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    s_add_i32 s13, s13, s9
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v3, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT:    s_bfe_u32 s14, s9, 0x10010
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    s_add_i32 s14, s14, s9
 ; GFX11-FAKE16-NEXT:    s_bitset1_b32 s9, 22
-; GFX11-FAKE16-NEXT:    s_addk_i32 s13, 0x7fff
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 16, v3
+; GFX11-FAKE16-NEXT:    s_addk_i32 s14, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
 ; GFX11-FAKE16-NEXT:    s_and_b32 s12, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s9, s9, s13
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s9, s9, s14
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s12, s25, 16
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v11, 0x40c00000, s8
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v3, 0x40c00000, s12
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, v5, v6
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s72, s9, 16
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v10, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 0x400000, v6
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s8, v3
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, v5, v10
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v9, 0x40c00000, s8
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v1, 0x40c00000, s12
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, v3, v4
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s63, s9, 16
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v3, v8, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, v3, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v8
 ; GFX11-FAKE16-NEXT:    s_bfe_u32 s12, s8, 0x10010
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v11, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v12, v9, 16, 1
 ; GFX11-FAKE16-NEXT:    s_add_i32 s12, s12, s8
 ; GFX11-FAKE16-NEXT:    s_bitset1_b32 s8, 22
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s12, 0x7fff
 ; GFX11-FAKE16-NEXT:    s_and_b32 s9, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
 ; GFX11-FAKE16-NEXT:    s_cselect_b32 s8, s8, s12
 ; GFX11-FAKE16-NEXT:    s_and_b32 s9, s24, 0xffff0000
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x7fff, v3
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s25, s8, 16
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v4, v14, vcc_lo
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v4, 0x40c00000, s9
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, v16, v11
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v16, 0x40c00000, s6
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s7, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v15, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v2, 0x40c00000, s9
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v8, 0x40c00000, s7
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, v12, v9
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v12, 0x40c00000, s6
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s7, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v9
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s28, s0, s5
 ; GFX11-FAKE16-NEXT:    s_bfe_u32 s9, s7, 0x10010
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 16, v3
 ; GFX11-FAKE16-NEXT:    s_add_i32 s9, s9, s7
 ; GFX11-FAKE16-NEXT:    s_bitset1_b32 s7, 22
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s9, 0x7fff
 ; GFX11-FAKE16-NEXT:    s_and_b32 s8, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cselect_b32 s7, s7, s9
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s24, 16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 0x7fff, v6
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v14, 0x40c00000, s8
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v11, 0x40c00000, s5
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v10, 16, 1
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s7, 16
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s8, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, v6, v10
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v14, v16, 16, 1
-; GFX11-FAKE16-NEXT:    s_bfe_u32 s5, s8, 0x10010
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-FAKE16-NEXT:    s_add_i32 s5, s5, s8
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s8
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v9, 0x40c00000, s4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v8, 16, 1
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s12, s7, 16
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s8, v10
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, v4, v8
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v10, v12, 16, 1
+; GFX11-FAKE16-NEXT:    s_bfe_u32 s4, s8, 0x10010
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v2
+; GFX11-FAKE16-NEXT:    s_add_i32 s4, s4, s8
 ; GFX11-FAKE16-NEXT:    s_bitset1_b32 s8, 22
-; GFX11-FAKE16-NEXT:    s_addk_i32 s5, 0x7fff
+; GFX11-FAKE16-NEXT:    s_addk_i32 s4, 0x7fff
 ; GFX11-FAKE16-NEXT:    s_and_b32 s6, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s5, s8, s5
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s4, s8, s4
 ; GFX11-FAKE16-NEXT:    s_and_b32 s6, s27, 0xffff0000
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 0x7fff, v6
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v17, 0x40c00000, s6
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v10
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, v14, v16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s24, s5, 16
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s6, v17
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 0x400000, v11
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v11, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v52, 0x40c00000, s6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, v10, v12
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s24, s4, 16
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s6, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v9, 16, 1
 ; GFX11-FAKE16-NEXT:    s_bfe_u32 s7, s6, 0x10010
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v50
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11-FAKE16-NEXT:    s_add_i32 s7, s7, s6
 ; GFX11-FAKE16-NEXT:    s_bitset1_b32 s6, 22
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s7, 0x7fff
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s5, s6, s7
+; GFX11-FAKE16-NEXT:    s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s4, s6, s7
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s27, 16
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, v6, v11
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 0x7fff, v10
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v16
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v10, 0x40c00000, s6
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s73, s5, 16
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v66, v3, 16, v15
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s6, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v52
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v48
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, v4, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x7fff, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v12
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v8, 0x40c00000, s6
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s73, s4, 16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v49
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s6, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v51
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v66, v1, 16, v11
 ; GFX11-FAKE16-NEXT:    s_bfe_u32 s7, s6, 0x10010
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX11-FAKE16-NEXT:    s_add_i32 s7, s7, s6
 ; GFX11-FAKE16-NEXT:    s_bitset1_b32 s6, 22
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s7, 0x7fff
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s6, s6, s7
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s26, 0xffff0000
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s27, s6, 16
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v5, 0x40c00000, s5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v53
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s14, s22, s28
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v55, v51, 16, v6
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v67, v49, 16, v7
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s8, v5
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v54, v4, 16, v10
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s12, s18, s40
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[12:13]
-; GFX11-FAKE16-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[15:16], 24, v[8:9]
-; GFX11-FAKE16-NEXT:    s_add_i32 s9, s9, s8
-; GFX11-FAKE16-NEXT:    s_bitset1_b32 s8, 22
-; GFX11-FAKE16-NEXT:    s_addk_i32 s9, 0x7fff
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s29, s8, s9
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s26, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s40, s29, 16
-; GFX11-FAKE16-NEXT:    v_add_f32_e64 v3, 0x40c00000, s8
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[10:11], 24, v[18:19]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[16:17], 24, v[1:2]
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s5, s1, s61
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s7, s3, s62
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s26, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s4, s6, s7
+; GFX11-FAKE16-NEXT:    s_and_b32 s6, s26, 0xffff0000
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s27, s4, 16
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v3, 0x40c00000, s6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v52
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v39
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v55, v50, 16, v4
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s8, s22, s13
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s6, v3
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[54:55]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[4:5], 24, v[66:67]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[5:6], 24, v[70:71]
-; GFX11-FAKE16-NEXT:    s_bfe_u32 s28, s26, 0x10010
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[6:7], 24, v[20:21]
-; GFX11-FAKE16-NEXT:    s_add_i32 s28, s28, s26
-; GFX11-FAKE16-NEXT:    s_bitset1_b32 s26, 22
-; GFX11-FAKE16-NEXT:    s_addk_i32 s28, 0x7fff
-; GFX11-FAKE16-NEXT:    s_and_b32 s29, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s26, s26, s28
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s6, s2, s11
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s26, s26, 16
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s13, s19, s59
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s11, s21, s60
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s10, s20, s10
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s15, s23, s63
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s29, s25, s72
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s41, s27, s73
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s40, s26, s40
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s28, s24, s42
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v55
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 8, v55
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v54
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v54, v2, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v67, v48, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[8:9], 24, v[17:18]
+; GFX11-FAKE16-NEXT:    s_bfe_u32 s5, s6, 0x10010
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[9:10], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    s_add_i32 s5, s5, s6
+; GFX11-FAKE16-NEXT:    s_bitset1_b32 s6, 22
+; GFX11-FAKE16-NEXT:    s_addk_i32 s5, 0x7fff
+; GFX11-FAKE16-NEXT:    s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s14, s6, s5
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s26, 16
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s6, s20, s10
+; GFX11-FAKE16-NEXT:    v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s13, s14, 16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[10:11], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[6:7]
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s29, s1, s58
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s11, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[1:2], 24, v[54:55]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[2:3], 24, v[66:67]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[70:71]
+; GFX11-FAKE16-NEXT:    s_bfe_u32 s10, s11, 0x10010
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[4:5], 24, v[19:20]
+; GFX11-FAKE16-NEXT:    s_add_i32 s10, s10, s11
+; GFX11-FAKE16-NEXT:    s_bitset1_b32 s11, 22
+; GFX11-FAKE16-NEXT:    s_addk_i32 s10, 0x7fff
+; GFX11-FAKE16-NEXT:    s_and_b32 s14, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s10, s11, s10
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s5, s19, s60
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s26, s10, 16
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s4, s18, s40
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s9, s23, s62
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 24, v55
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 8, v55
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v54
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 8, v54
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 24, v67
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v67
@@ -178126,114 +178022,120 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 8, v71
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v70
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v70
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 24, v21
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 24, v20
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v20, 8, v20
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v19, 8, v19
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 8, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 8, v14
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v13
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 8, v12
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v9
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v8
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 24, v2
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s9, s17, s58
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s8, s16, s45
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[92:93], s[40:41], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 vcc, s[28:29], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[90:91], s[14:15], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[94:95], s[10:11], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[42:43], s[12:13], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[46:47], s[6:7], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[56:57], s[4:5], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b64 s[30:31], s[8:9], 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s41, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s41, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s40, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s40, s40, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s47, s29, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s29, s29, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s57, s28, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s28, s28, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s74, s15, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s15, s15, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s75, s14, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s14, s14, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s76, s11, 24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 8, v6
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s7, s21, s61
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s11, s25, s63
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s57, s27, s73
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s56, s26, s13
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s10, s24, s12
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[94:95], s[8:9], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[12:13], s[4:5], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[14:15], s[46:47], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[40:41], s[44:45], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[42:43], s[28:29], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b64 vcc, s[56:57], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[34:35], s[10:11], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[30:31], s[6:7], 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s13, s57, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s15, s57, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s56, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s56, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s56, s11, 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s11, s11, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s77, s10, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s57, s10, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s10, s10, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s78, s13, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s13, s13, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s79, s12, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s12, s12, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s88, s9, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s74, s9, 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s9, s9, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s89, s8, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s75, s8, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s8, s8, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s91, s7, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s95, s7, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s93, s6, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 vcc_hi, s6, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s7, s5, 24
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s99, s5, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s44, s4, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s104, s4, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s76, s7, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s77, s7, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s78, s6, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s79, s6, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s88, s5, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s89, s5, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s90, s4, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s91, s4, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s92, s47, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s47, s47, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s93, s46, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s46, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s95, s45, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s45, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s99, s44, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s100, s44, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s101, s29, 24
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s102, s29, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s103, s28, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s104, s28, 8
 ; GFX11-FAKE16-NEXT:    s_branch .LBB91_5
 ; GFX11-FAKE16-NEXT:  .LBB91_3:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr74
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr104
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr44
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr56
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr99
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr41
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr103
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr102
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr101
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr99
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr40
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr12
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr37
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr35
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr6
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr34
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr51
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr53
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr50
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr7
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr36
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr55
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr64
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr53
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr96
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr38
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr66
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr67
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr39
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr14
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr54
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr70
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr67
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr68
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr15
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr65
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr66
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr83
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr80
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr97
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr69
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr85
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr83
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr70
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr81
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr86
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr87
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr40
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr84
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr102
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr85
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr30
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr94
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr92
@@ -178241,176 +178143,169 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr88
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr78
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr76
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s5, 1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s4, 2
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s5, 3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s74, 4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s75, 5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr74
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr5
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s74, 6
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v43, s75, 7
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr74
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 0
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s11, 1
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 2
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s11, 3
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s10, 4
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s11, 5
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; kill: killed $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr11
 ; GFX11-FAKE16-NEXT:    s_branch .LBB91_2
 ; GFX11-FAKE16-NEXT:  .LBB91_4:
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v3, s74 :: v_dual_mov_b32 v10, s90
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v15, s94 :: v_dual_mov_b32 v16, s30
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s90, v42, 4
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s94, v42, 2
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s91, v42, 5
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s95, v42, 3
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v53, s4 :: v_dual_mov_b32 v52, s5
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v51, s40 :: v_dual_mov_b32 v50, s6
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v48, s7 :: v_dual_mov_b32 v49, s98
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v39, s8 :: v_dual_mov_b32 v38, s97
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v37, s9 :: v_dual_mov_b32 v36, s58
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v35, s59 :: v_dual_mov_b32 v34, s15
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v33, s60 :: v_dual_mov_b32 v32, s61
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v31, s14 :: v_dual_mov_b32 v30, s62
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v28, s63 :: v_dual_mov_b32 v29, s96
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v27, s72 :: v_dual_mov_b32 v26, s13
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v25, s73 :: v_dual_mov_b32 v24, s28
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v22, s29 :: v_dual_mov_b32 v23, s12
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v17, s87 :: v_dual_mov_b32 v54, s86
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v7, s84 :: v_dual_mov_b32 v66, s85
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v11, s11 :: v_dual_mov_b32 v64, s83
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v65, s10 :: v_dual_mov_b32 v70, s81
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v55, s48 :: v_dual_mov_b32 v68, s80
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v69, s82 :: v_dual_mov_b32 v80, s71
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v67, s69 :: v_dual_mov_b32 v20, s70
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v71, s65 :: v_dual_mov_b32 v82, s67
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v21, s68 :: v_dual_mov_b32 v18, s66
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v81, s54 :: v_dual_mov_b32 v84, s64
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v19, s39 :: v_dual_mov_b32 v12, s55
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v83, s38 :: v_dual_mov_b32 v86, s52
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v13, s53 :: v_dual_mov_b32 v8, s51
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v85, s36 :: v_dual_mov_b32 v96, s49
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v9, s50 :: v_dual_mov_b32 v2, s35
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, s37 :: v_dual_mov_b32 v4, s76
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v87, s34 :: v_dual_mov_b32 v6, s88
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v5, s78 :: v_dual_mov_b32 v14, s92
-; GFX11-FAKE16-NEXT:    s_mov_b32 s61, s41
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s62, v42, 6
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s58, v42, 7
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s59, v42, 8
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s60, v42, 9
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s63, v42, 10
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s72, v42, 11
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s73, v42, 12
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s43, v42, 13
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s41, v42, 14
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s45, v42, 15
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s40, v42, 16
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s47, v42, 17
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s29, v42, 18
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s57, v42, 19
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s28, v42, 20
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s74, v42, 21
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s15, v42, 22
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s75, v42, 23
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s14, v42, 24
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s76, v42, 25
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s11, v42, 26
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s77, v42, 27
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s10, v42, 28
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s78, v42, 29
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s13, v42, 30
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s79, v42, 31
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s12, v43, 0
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s88, v43, 1
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s9, v43, 2
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s89, v43, 3
-; GFX11-FAKE16-NEXT:    s_mov_b32 s92, s100
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s8, v43, 4
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s91, v43, 5
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s93, v43, 6
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s95, v43, 7
-; GFX11-FAKE16-NEXT:    s_mov_b32 vcc_lo, s102
-; GFX11-FAKE16-NEXT:    v_readlane_b32 vcc_hi, v43, 8
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s7, v43, 9
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v10, s94 :: v_dual_mov_b32 v11, s30
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s94, v43, 2
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v96, s37 :: v_dual_mov_b32 v87, s34
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v6, s49 :: v_dual_mov_b32 v7, s35
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s95, v43, 3
+; GFX11-FAKE16-NEXT:    v_readlane_b32 vcc_lo, v43, 6
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v43, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s34, v43, 4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v52, s44 :: v_dual_mov_b32 v51, s45
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v49, s46
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v39, s47 :: v_dual_mov_b32 v48, s98
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, s56 :: v_dual_mov_b32 v37, s97
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, s57 :: v_dual_mov_b32 v35, s58
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, s59 :: v_dual_mov_b32 v33, s9
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, s60 :: v_dual_mov_b32 v31, s61
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v30, s8 :: v_dual_mov_b32 v29, s62
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v27, s63 :: v_dual_mov_b32 v28, s96
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v26, s72 :: v_dual_mov_b32 v25, s7
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v24, s73 :: v_dual_mov_b32 v23, s28
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v21, s29 :: v_dual_mov_b32 v22, s6
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v53, s87 :: v_dual_mov_b32 v54, s86
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v5, s85 :: v_dual_mov_b32 v12, s5
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v65, s4 :: v_dual_mov_b32 v66, s48
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v55, s81 :: v_dual_mov_b32 v64, s84
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v69, s83 :: v_dual_mov_b32 v70, s82
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v67, s70 :: v_dual_mov_b32 v68, s80
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v80, s71 :: v_dual_mov_b32 v19, s39
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v71, s66 :: v_dual_mov_b32 v20, s69
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v82, s68 :: v_dual_mov_b32 v17, s67
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v81, s55 :: v_dual_mov_b32 v18, s65
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v84, s38 :: v_dual_mov_b32 v15, s64
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v83, s51 :: v_dual_mov_b32 v16, s54
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v86, s53 :: v_dual_mov_b32 v13, s52
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v85, s36 :: v_dual_mov_b32 v14, s50
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, s74 :: v_dual_mov_b32 v2, s76
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v3, s78 :: v_dual_mov_b32 v4, s88
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v8, s90 :: v_dual_mov_b32 v9, s92
+; GFX11-FAKE16-NEXT:    s_mov_b32 s58, s11
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s59, v43, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s72, v43, 9
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s60, v43, 10
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s61, v43, 11
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s62, v43, 12
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s63, v43, 13
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s73, v43, 14
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s13, v43, 15
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s15, v43, 16
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s41, v43, 17
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s43, v43, 18
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s56, v43, 19
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s11, v43, 20
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s57, v43, 21
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s10, v43, 22
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s74, v43, 23
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s9, v43, 24
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s75, v43, 25
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s8, v43, 26
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s76, v43, 27
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s77, v43, 28
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s78, v43, 29
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s79, v43, 30
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s88, v43, 31
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s89, v42, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s90, v42, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s91, v42, 2
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s92, v42, 3
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s47, v42, 4
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s93, v42, 5
+; GFX11-FAKE16-NEXT:    v_readlane_b32 vcc_hi, v43, 7
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s46, v42, 6
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v43, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s95, v42, 7
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s45, v42, 8
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s35, v43, 5
 ; GFX11-FAKE16-NEXT:  .LBB91_5: ; %end
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s104, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s44, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s56, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s5, s103, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s42, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s4
 ; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s6
 ; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s99, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s61, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s102, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s6, s58, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s101, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s5
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, s6, s7
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
@@ -178420,15 +178315,15 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s4
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s5
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, vcc_hi, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s93, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s46, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s100, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s5, s99, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s40, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s4
 ; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s6
 ; GFX11-FAKE16-NEXT:    s_and_b32 s3, s3, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s95, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s62, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s91, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s45, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s6, s59, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s95, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s3, s5
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, s6, s7
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
@@ -178440,15 +178335,15 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s8, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s89, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s30, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s46, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s93, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s14, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s17, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s9, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s58, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s88, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s47, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s4, s72, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s92, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s4, s5
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
@@ -178458,15 +178353,15 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s12, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s79, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s42, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s91, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s4, s90, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s12, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s4, s5
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s19, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s13, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s59, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s78, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s89, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s6, s60, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s88, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s4, s4, s5
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, s6, s7
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
@@ -178478,14 +178373,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v112, s0 :: v_dual_mov_b32 v113, s1
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v114, s2 :: v_dual_mov_b32 v115, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s10, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s77, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s94, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s79, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s78, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s30, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s21, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s11, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s60, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s77, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s4, s61, 0xff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s76, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s4, s5
@@ -178496,14 +178391,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s14, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s8, 8
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s75, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s90, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s94, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s4, s5
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s23, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s15, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s63, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s9, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s6, s62, 0xff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s74, 8
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
@@ -178519,32 +178414,32 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s28, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s10, 8
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s57, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, vcc_lo, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s34, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s4
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s29, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s11, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-FAKE16-NEXT:    s_and_b32 s1, s25, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s72, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s47, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s3, s63, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s56, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s2
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s3, s4
 ; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX11-FAKE16-NEXT:    s_and_b32 s3, s26, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s40, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s43, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s2
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s3, s4
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s45, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s92, 8
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s41, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s3, s41, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, vcc_lo, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s15, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s3, s4
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s27, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s43, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s13, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s4, s4, s5
 ; GFX11-FAKE16-NEXT:    s_and_b32 s5, s73, 0xff
 ; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
@@ -178552,160 +178447,160 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s5, 16
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v113, s1 :: v_dual_and_b32 v24, 0xff, v24
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v112, s0 :: v_dual_lshlrev_b32 v1, 8, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 8, v16
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v112, s0 :: v_dual_and_b32 v23, 0xff, v23
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v113, s1 :: v_dual_lshlrev_b32 v6, 8, v6
 ; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v114, s2 :: v_dual_lshlrev_b32 v11, 8, v11
 ; GFX11-FAKE16-NEXT:    s_or_b32 s3, s4, s5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v115, s3 :: v_dual_and_b32 v96, 0xff, v96
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v24, v1
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v114, s2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v96, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v23, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v96, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v24
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v25, v9
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v1, v16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v23
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 8, v87
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v27
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v86
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v84
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v16, v22
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v23, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v26
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 8, v85
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v30
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v27, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v6, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v87
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v86
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v11, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v22, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v26, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v24, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v84
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v27
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v29
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 8, v83
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v22
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v23, v12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v25, v14
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v26, v13
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v27, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 8, v83
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v22, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v24, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v25, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v26, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v1, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v8, v15
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v9, v16
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v14
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v33
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 8, v18
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v82
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 8, v10
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v32
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 8, v19
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v6, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v11, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v13, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v15, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v16, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 8, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v82
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 8, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v30
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 8, v81
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v36
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v20
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v22
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v8, v9
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v10, v14
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v15, v16
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v18, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 8, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v13, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v17, v18
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v80
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v35
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 8, v21
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v34
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v71
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v39
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v70
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v69
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v14, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 8, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 8, v71
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 8, v70
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v69
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v14, v4
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v15, v16
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v18, v19
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v20, v21
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v22, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v21, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff, v14
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v1, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v8, v9
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v10, v6
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v18, v19
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v16, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v37
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 8, v68
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v38
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 8, v67
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v50
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 8, v66
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v65
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v48
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v64
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v5, v6
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v16, v18
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v19, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v20, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v9, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 8, v68
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v37
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 8, v67
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v49
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 8, v55
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v53
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 8, v54
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v52
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v51
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v18
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v19, v20
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v17, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v21, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v22, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 8, v66
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 8, v64
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v18, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v19, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v48
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 8, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v52
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v53
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v51
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v50
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v18, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v20, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v21, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v22, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v18
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v1, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v5, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v6, v16
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v17, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v18, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v3, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v10, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v11, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v17, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v12, v5
 ; GFX11-FAKE16-NEXT:    s_clause 0x5
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[97:100], off offset:32
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[112:115], off offset:48
-; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[24:27], off offset:64
-; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[12:15], off offset:80
-; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[8:11], off offset:96
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[23:26], off offset:64
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[6:9], off offset:80
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:96
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off offset:112
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s104, v41, 8
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s103, v41, 7
@@ -178794,40 +178689,39 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; SI-NEXT:    v_mov_b32_e32 v51, v3
-; SI-NEXT:    v_mov_b32_e32 v49, v1
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v5, v0
-; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:392
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v44, v19
+; SI-NEXT:    v_mov_b32_e32 v43, v17
+; SI-NEXT:    v_mov_b32_e32 v32, v14
+; SI-NEXT:    v_mov_b32_e32 v14, v12
+; SI-NEXT:    v_mov_b32_e32 v12, v10
+; SI-NEXT:    v_mov_b32_e32 v41, v7
+; SI-NEXT:    v_mov_b32_e32 v55, v5
+; SI-NEXT:    v_mov_b32_e32 v54, v3
+; SI-NEXT:    v_mov_b32_e32 v51, v1
+; SI-NEXT:    v_mov_b32_e32 v10, v0
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:392
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:20
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:28
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:36
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:44
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:52
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:60
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:68
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:76
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:44
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:60
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:68
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:76
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:84
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:100
-; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:108
+; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:108
 ; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:116
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:124
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:132
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:140
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:148
@@ -178836,135 +178730,129 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:172
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:180
 ; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:188
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:128
+; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:128
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:136
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v18
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v20
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v22
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v24
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v26
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v28
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v30
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 8, v32
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v27
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v25
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v3
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v9
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v49
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v11
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v50
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v13
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v52
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v15
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v53
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v25
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v27
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v17
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v40
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v32
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v5
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v19
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v42
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v33
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:136
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v21
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:144
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v17
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v34
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v23
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v35
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v31
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v36
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v37
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr36
+; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    ; implicit-def: $vgpr29
 ; SI-NEXT:    ; implicit-def: $vgpr37
 ; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $vgpr35
+; SI-NEXT:    ; implicit-def: $vgpr17
 ; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:144
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:160
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:168
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:176
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr27
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:160
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:168
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:176
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v38
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v39
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v48
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:196
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:204
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:212
@@ -178975,27 +178863,27 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr38
 ; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:200
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:208
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v2
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v4
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:228
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:236
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:244
@@ -179003,78 +178891,82 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:224
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:232
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:240
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v2
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v4
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:260
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:276
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:284
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:256
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v54, 8, v4
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:264
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:272
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v2
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v4
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:292
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:300
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:308
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:316
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:288
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 8, v1
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v43, 8, v2
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v44, 8, v3
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v45, 8, v4
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:296
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:304
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v2
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:324
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:332
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:340
@@ -179090,64 +178982,79 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v61, 8, v4
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:328
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:336
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:356
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:364
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:372
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:380
-; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:352
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:360
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:368
-; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:352
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    v_lshlrev_b32_e32 v62, 8, v1
-; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 8, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:388
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:384
-; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 8, v2
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_lshlrev_b32_e32 v63, 8, v3
-; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v4
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:48
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v4
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:360
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:368
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:48
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:388
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:384
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:40
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:24
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:112
 ; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:120
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:152
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:184
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:152
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:184
 ; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:216
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:248
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:248
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:280
 ; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:312
 ; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:344
@@ -179160,265 +179067,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:72
 ; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:64
 ; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:56
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; SI-NEXT:    s_cbranch_execz .LBB92_2
-; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v11, v7
-; SI-NEXT:    v_mov_b32_e32 v7, v6
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v49
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; SI-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; SI-NEXT:    v_and_b32_e32 v24, 0xff, v24
-; SI-NEXT:    v_and_b32_e32 v26, 0xff, v26
-; SI-NEXT:    v_and_b32_e32 v28, 0xff, v28
-; SI-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; SI-NEXT:    v_and_b32_e32 v34, 0xff, v34
-; SI-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; SI-NEXT:    v_and_b32_e32 v36, 0xff, v36
-; SI-NEXT:    v_and_b32_e32 v50, 0xff, v50
-; SI-NEXT:    v_and_b32_e32 v52, 0xff, v52
-; SI-NEXT:    v_and_b32_e32 v42, 0xff, v42
-; SI-NEXT:    v_or_b32_e32 v54, v42, v54
-; SI-NEXT:    v_and_b32_e32 v53, 0xff, v53
-; SI-NEXT:    v_and_b32_e32 v41, 0xff, v41
-; SI-NEXT:    v_or_b32_e32 v27, v41, v27
-; SI-NEXT:    v_and_b32_e32 v41, 0xff, v57
-; SI-NEXT:    v_or_b32_e32 v1, v41, v1
-; SI-NEXT:    v_and_b32_e32 v41, 0xff, v47
-; SI-NEXT:    v_or_b32_e32 v3, v41, v3
-; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v0, v63
-; SI-NEXT:    v_and_b32_e32 v56, 0xff, v56
-; SI-NEXT:    v_and_b32_e32 v55, 0xff, v55
-; SI-NEXT:    v_or_b32_e32 v56, v56, v61
-; SI-NEXT:    v_or_b32_e32 v55, v55, v62
-; SI-NEXT:    ; implicit-def: $vgpr57
-; SI-NEXT:    ; implicit-def: $vgpr47
-; SI-NEXT:    ; implicit-def: $vgpr61
-; SI-NEXT:    ; implicit-def: $vgpr62
-; SI-NEXT:    ; implicit-def: $vgpr63
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_or_b32_e32 v2, v2, v6
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v51
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_or_b32_e32 v2, v2, v6
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v2, v2, v7
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v2, v2, v8
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v2, v2, v10
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v2, v2, v12
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v2, v2, v14
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v2, v2, v6
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v2, v2, v6
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v2, v2, v6
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v23, v2, v6
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v7, v2, v6
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_or_b32_e32 v31, v2, v6
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v55
-; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_or_b32_e32 v49, v2, v6
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v49
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_or_b32_e32 v25, v2, v6
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v25
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v0
 ; SI-NEXT:    ; implicit-def: $vgpr0
 ; SI-NEXT:    ; kill: killed $vgpr0
 ; SI-NEXT:    ; implicit-def: $vgpr0
@@ -179492,483 +179140,748 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr0
 ; SI-NEXT:    ; kill: killed $vgpr0
 ; SI-NEXT:    ; implicit-def: $vgpr0
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
 ; SI-NEXT:    ; kill: killed $vgpr0
 ; SI-NEXT:    ; implicit-def: $vgpr0
-; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SI-NEXT:    s_cbranch_execz .LBB92_2
+; SI-NEXT:  ; %bb.1: ; %cmp.false
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v51
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; SI-NEXT:    v_and_b32_e32 v24, 0xff, v24
+; SI-NEXT:    v_and_b32_e32 v26, 0xff, v26
+; SI-NEXT:    v_and_b32_e32 v28, 0xff, v28
+; SI-NEXT:    v_and_b32_e32 v30, 0xff, v30
+; SI-NEXT:    v_and_b32_e32 v34, 0xff, v34
+; SI-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; SI-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; SI-NEXT:    v_and_b32_e32 v39, 0xff, v50
+; SI-NEXT:    v_and_b32_e32 v48, 0xff, v40
+; SI-NEXT:    v_and_b32_e32 v49, 0xff, v49
+; SI-NEXT:    v_and_b32_e32 v52, 0xff, v52
+; SI-NEXT:    v_and_b32_e32 v42, 0xff, v42
+; SI-NEXT:    v_and_b32_e32 v46, 0xff, v46
+; SI-NEXT:    v_or_b32_e32 v45, v46, v45
+; SI-NEXT:    v_and_b32_e32 v56, 0xff, v56
+; SI-NEXT:    v_or_b32_e32 v56, v56, v61
+; SI-NEXT:    v_and_b32_e32 v57, 0xff, v57
+; SI-NEXT:    v_and_b32_e32 v47, 0xff, v47
+; SI-NEXT:    v_or_b32_e32 v1, v57, v1
+; SI-NEXT:    v_or_b32_e32 v3, v47, v3
+; SI-NEXT:    ; implicit-def: $vgpr57
+; SI-NEXT:    ; implicit-def: $vgpr47
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v0, v2, v0
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v54
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v0, v2, v0
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v55
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_or_b32_e32 v0, v2, v6
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v41
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_or_b32_e32 v0, v2, v8
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v9
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_or_b32_e32 v0, v2, v12
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v11
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_or_b32_e32 v0, v2, v14
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v13
+; SI-NEXT:    v_mov_b32_e32 v8, v7
+; SI-NEXT:    v_mov_b32_e32 v7, v19
+; SI-NEXT:    v_or_b32_e32 v19, v2, v32
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v15
+; SI-NEXT:    v_and_b32_e32 v35, 0xff, v7
+; SI-NEXT:    v_and_b32_e32 v7, 0xff, v53
+; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr19
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_or_b32_e32 v4, v2, v4
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; SI-NEXT:    v_or_b32_e32 v17, v2, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v43
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v29, v2, v6
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v23, v2, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v44
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v29
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v23
+; SI-NEXT:    v_cvt_f32_f16_e32 v23, v56
+; SI-NEXT:    ; implicit-def: $vgpr56
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; SI-NEXT:    v_or_b32_e32 v32, v6, v8
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v2, v2, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; SI-NEXT:    v_or_b32_e32 v31, v2, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v21
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v32
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v31
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_or_b32_e32 v11, v6, v8
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v51, v2, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v27
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v51
+; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_or_b32_e32 v33, v6, v8
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v27, v2, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v33
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v27, v3
+; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v29, v2, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v29
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v2, v2, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v5, v5, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v1
+; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
 ; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_or_b32_e32 v4, v4, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v4
+; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v33, v6, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v33
+; SI-NEXT:    v_cvt_f32_f16_e32 v33, v45
+; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
 ; SI-NEXT:    v_or_b32_e32 v6, v6, v8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v6
 ; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_or_b32_e32 v18, v18, v8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v8, 0xff, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v8, v8, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v8
+; SI-NEXT:    ; implicit-def: $vgpr8
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v12, 0xff, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v12, v12, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v12
+; SI-NEXT:    ; implicit-def: $vgpr12
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v14, v14, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v14
+; SI-NEXT:    ; implicit-def: $vgpr14
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v32, 0xff, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v32, v32, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_or_b32_e32 v18, v18, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v18
 ; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_or_b32_e32 v22, v22, v8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v22, v22, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v22
 ; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_or_b32_e32 v24, v24, v8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v24, v24, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v24
 ; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_or_b32_e32 v26, v26, v8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v26, v26, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v26
 ; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_or_b32_e32 v28, v28, v8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v28, v28, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v28
 ; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_or_b32_e32 v30, v30, v8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v30, v30, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v30
 ; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_or_b32_e32 v34, v34, v8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v34, v34, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v34
 ; SI-NEXT:    ; implicit-def: $vgpr34
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_or_b32_e32 v16, v16, v8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v16, v16, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v16
 ; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_or_b32_e32 v20, v20, v8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v20, v20, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v20
 ; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_or_b32_e32 v35, v35, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v35
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v13, 0xff, v8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v36, 0xff, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v13, v13, v8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v36, v36, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v13
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v56
-; SI-NEXT:    ; implicit-def: $vgpr56
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v36
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; SI-NEXT:    v_or_b32_e32 v8, v8, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v37, 0xff, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v37, v37, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v8
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v37
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; SI-NEXT:    v_or_b32_e32 v10, v10, v12
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v38, 0xff, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v38, v38, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v10
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v38
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_or_b32_e32 v36, v36, v12
-; SI-NEXT:    v_and_b32_e32 v12, 0xff, v15
-; SI-NEXT:    v_or_b32_e32 v12, v12, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v17
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v39, v39, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v39
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_or_b32_e32 v7, v7, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v12
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_or_b32_e32 v15, v15, v14
-; SI-NEXT:    v_and_b32_e32 v14, 0xff, v19
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_or_b32_e32 v14, v14, v17
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v7
+; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_or_b32_e32 v48, v48, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v15
-; SI-NEXT:    ; implicit-def: $vgpr15
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v48
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_or_b32_e32 v49, v49, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v14
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_or_b32_e32 v50, v50, v17
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v49
+; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v50, 0xff, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v50, v50, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v50
 ; SI-NEXT:    ; implicit-def: $vgpr50
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; SI-NEXT:    v_or_b32_e32 v17, v17, v19
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v17
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v19, 0xff, v19
-; SI-NEXT:    v_or_b32_e32 v19, v19, v21
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v9, v9, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v19
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v9
+; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v35, 0xff, v21
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v54, 0xff, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v35, v35, v21
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v54, v54, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v35
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v54
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v54
 ; SI-NEXT:    ; implicit-def: $vgpr54
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_or_b32_e32 v52, v52, v21
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v52, v52, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v52
 ; SI-NEXT:    ; implicit-def: $vgpr52
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v37, 0xff, v21
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v37, v37, v21
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v37
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v38, 0xff, v21
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v38, v38, v21
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v11, v11, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v38
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v11
+; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v39, 0xff, v21
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v53, 0xff, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v39, v39, v21
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v39
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_or_b32_e32 v53, v53, v21
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v53, v53, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v53
 ; SI-NEXT:    ; implicit-def: $vgpr53
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v21, 0xff, v21
-; SI-NEXT:    v_or_b32_e32 v21, v21, v48
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v3
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v48, 0xff, v48
-; SI-NEXT:    v_or_b32_e32 v48, v48, v51
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v38, v48
-; SI-NEXT:    v_mov_b32_e32 v48, v2
+; SI-NEXT:    v_and_b32_e32 v55, 0xff, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v51, 0xff, v51
-; SI-NEXT:    v_or_b32_e32 v51, v51, v9
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v51
-; SI-NEXT:    ; implicit-def: $vgpr51
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v9
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(1) expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v9
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v42, 0xff, v42
-; SI-NEXT:    v_or_b32_e32 v40, v42, v40
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v40
+; SI-NEXT:    v_or_b32_e32 v55, v55, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v13, 0xff, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v13, v13, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v41, 0xff, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v41, v41, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v40, 0xff, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v40, v40, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v48, v40
 ; SI-NEXT:    ; implicit-def: $vgpr40
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(3) expcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v9
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_and_b32_e32 v42, 0xff, v42
-; SI-NEXT:    v_or_b32_e32 v42, v42, v43
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v42
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT:    v_or_b32_e32 v0, v0, v15
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_or_b32_e32 v42, v42, v15
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v38, v42
 ; SI-NEXT:    ; implicit-def: $vgpr42
-; SI-NEXT:    s_waitcnt vmcnt(2) expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v9
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v43, 0xff, v43
-; SI-NEXT:    v_or_b32_e32 v43, v43, v44
-; SI-NEXT:    v_and_b32_e32 v44, 0xff, v46
-; SI-NEXT:    v_or_b32_e32 v44, v44, v45
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v43
-; SI-NEXT:    v_cvt_f32_f16_e32 v29, v44
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v43, 0xff, v15
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v43, v43, v15
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v43
 ; SI-NEXT:    ; implicit-def: $vgpr43
-; SI-NEXT:    ; implicit-def: $vgpr44
-; SI-NEXT:    s_waitcnt vmcnt(3) expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v9
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_and_b32_e32 v45, 0xff, v45
-; SI-NEXT:    v_or_b32_e32 v45, v45, v58
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    s_waitcnt vmcnt(2) expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v9
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v46, 0xff, v46
-; SI-NEXT:    v_or_b32_e32 v46, v46, v59
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
-; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    s_waitcnt vmcnt(1) expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v9
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v58, 0xff, v58
-; SI-NEXT:    v_or_b32_e32 v58, v58, v60
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v58
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; SI-NEXT:    v_or_b32_e32 v15, v15, v21
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v36, v15
+; SI-NEXT:    ; implicit-def: $vgpr15
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v44, 0xff, v21
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v44, v44, v21
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v46, 0xff, v21
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v46, v46, v58
+; SI-NEXT:    v_cvt_f32_f16_e32 v39, v46
+; SI-NEXT:    ; implicit-def: $vgpr46
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v58, 0xff, v21
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v58, v58, v59
+; SI-NEXT:    v_cvt_f32_f16_e32 v29, v58
 ; SI-NEXT:    ; implicit-def: $vgpr58
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v59, 0xff, v21
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v59, v59, v60
+; SI-NEXT:    v_cvt_f32_f16_e32 v37, v59
+; SI-NEXT:    ; implicit-def: $vgpr59
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v60, 0xff, v21
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v60, v60, v62
+; SI-NEXT:    v_cvt_f32_f16_e32 v35, v60
 ; SI-NEXT:    ; implicit-def: $vgpr60
-; SI-NEXT:    s_waitcnt vmcnt(1) expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v9
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(1) expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v9
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
-; SI-NEXT:    ; implicit-def: $vgpr41
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr62
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v61, 0xff, v21
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v25, v61, v25
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v25
+; SI-NEXT:    ; implicit-def: $vgpr25
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v61, 0xff, v21
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v61, v61, v63
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, v61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr63
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v46
-; SI-NEXT:    ; implicit-def: $vgpr46
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v44
+; SI-NEXT:    ; implicit-def: $vgpr44
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v55
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr55
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; kill: killed $vgpr0
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v13
+; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v41
+; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr21
 ; SI-NEXT:  .LBB92_2: ; %Flow
 ; SI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB92_4
 ; SI-NEXT:  ; %bb.3: ; %cmp.true
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v47
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v9, v3, v2
-; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_or_b32_e32 v5, v3, v2
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v57
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v11, v63, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v41
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v13, v27, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v55
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v21, v62, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v56
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v23, v61, v2
+; SI-NEXT:    v_add_i32_e32 v29, vcc, 3, v19
+; SI-NEXT:    v_and_b32_e32 v29, 0xff, v29
+; SI-NEXT:    v_add_i32_e32 v20, vcc, 3, v20
+; SI-NEXT:    v_and_b32_e32 v20, 0xff, v20
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 3, v16
 ; SI-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_add_i32_e32 v34, vcc, 3, v34
+; SI-NEXT:    v_and_b32_e32 v34, 0xff, v34
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_add_i32_e32 v29, vcc, 3, v30
-; SI-NEXT:    v_and_b32_e32 v29, 0xff, v29
+; SI-NEXT:    v_add_i32_e32 v30, vcc, 3, v30
+; SI-NEXT:    v_and_b32_e32 v30, 0xff, v30
 ; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_add_i32_e32 v28, vcc, 3, v28
 ; SI-NEXT:    v_and_b32_e32 v28, 0xff, v28
@@ -179986,737 +179899,780 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_and_b32_e32 v18, 0xff, v18
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v4
 ; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; SI-NEXT:    s_movk_i32 s6, 0x300
-; SI-NEXT:    v_add_i32_e32 v63, vcc, 3, v51
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 3, v49
-; SI-NEXT:    v_and_b32_e32 v63, 0xff, v63
+; SI-NEXT:    v_mov_b32_e32 v17, v43
+; SI-NEXT:    v_mov_b32_e32 v19, v44
+; SI-NEXT:    v_add_i32_e32 v47, vcc, 3, v19
+; SI-NEXT:    v_and_b32_e32 v47, 0xff, v47
+; SI-NEXT:    v_add_i32_e32 v57, vcc, 3, v15
+; SI-NEXT:    v_and_b32_e32 v57, 0xff, v57
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 3, v51
 ; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT:    s_movk_i32 s6, 0x300
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v25, v60, v2
+; SI-NEXT:    v_or_b32_e32 v23, v63, v2
+; SI-NEXT:    v_add_i32_e32 v63, vcc, 3, v54
+; SI-NEXT:    v_and_b32_e32 v63, 0xff, v63
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v27, v59, v2
+; SI-NEXT:    v_or_b32_e32 v25, v25, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v0
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v31, v58, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v46
+; SI-NEXT:    v_or_b32_e32 v31, v62, v2
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v56
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v32, v45, v2
+; SI-NEXT:    v_or_b32_e32 v33, v61, v2
+; SI-NEXT:    v_add_i32_e32 v56, vcc, 3, v17
+; SI-NEXT:    v_and_b32_e32 v56, 0xff, v56
+; SI-NEXT:    v_add_i32_e32 v61, vcc, 3, v41
+; SI-NEXT:    v_and_b32_e32 v61, 0xff, v61
+; SI-NEXT:    v_add_i32_e32 v62, vcc, 3, v55
+; SI-NEXT:    v_and_b32_e32 v62, 0xff, v62
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v0
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v33, v44, v2
+; SI-NEXT:    v_or_b32_e32 v35, v60, v2
+; SI-NEXT:    v_add_i32_e32 v60, vcc, 3, v9
+; SI-NEXT:    v_and_b32_e32 v60, 0xff, v60
+; SI-NEXT:    v_or_b32_e32 v12, v12, v60
+; SI-NEXT:    v_add_i32_e32 v60, vcc, s6, v25
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v35, v43, v2
+; SI-NEXT:    v_or_b32_e32 v37, v59, v2
+; SI-NEXT:    v_add_i32_e32 v59, vcc, 3, v11
+; SI-NEXT:    v_and_b32_e32 v59, 0xff, v59
+; SI-NEXT:    v_or_b32_e32 v14, v14, v59
+; SI-NEXT:    v_add_i32_e32 v59, vcc, s6, v31
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v37, v40, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v42
+; SI-NEXT:    v_or_b32_e32 v38, v58, v2
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v46
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v38, v54, v2
+; SI-NEXT:    v_or_b32_e32 v39, v45, v2
+; SI-NEXT:    v_add_i32_e32 v46, vcc, 3, v21
+; SI-NEXT:    v_and_b32_e32 v46, 0xff, v46
+; SI-NEXT:    v_add_i32_e32 v58, vcc, 3, v13
+; SI-NEXT:    v_and_b32_e32 v58, 0xff, v58
+; SI-NEXT:    v_or_b32_e32 v32, v32, v58
+; SI-NEXT:    v_add_i32_e32 v58, vcc, s6, v33
+; SI-NEXT:    v_add_i32_e32 v9, vcc, s6, v32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v39, v0, v2
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v48, v0, v2
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v48, v0, v2
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v54, v0, v2
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v53
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v42
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v55, v0, v2
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v0
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v41, v0, v2
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v0
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v2
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v0
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v2
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v52
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v2
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v2
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v2
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v52
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v50
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v50, v0, v2
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v19
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v19, v0, v2
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v17
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v17, v0, v2
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v15
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v49
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v15, v0, v2
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v36
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v40
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v2
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v53
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v50
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v2
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v0
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v2
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v0
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v0, v0, v2
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_or_b32_e32 v2, v0, v2
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v27, vcc, 3, v0
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v20
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 3, v34
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; SI-NEXT:    v_and_b32_e32 v27, 0xff, v27
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v2, v0, v2
+; SI-NEXT:    v_or_b32_e32 v0, v0, v27
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v27, vcc, s6, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v16, v0, v16
+; SI-NEXT:    v_or_b32_e32 v0, v0, v29
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v16, vcc, s6, v16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v0, v0, v20
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_or_b32_e32 v20, v0, v20
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v0, v0, v29
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; SI-NEXT:    v_or_b32_e32 v16, v0, v16
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v0, v0, v34
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v29, vcc, s6, v17
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v0, v0, v30
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v28
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v28, vcc, s6, v15
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v26
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v24
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v22
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v22, vcc, s6, v20
+; SI-NEXT:    v_add_i32_e32 v20, vcc, s6, v16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v18
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v18, vcc, s6, v2
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v30, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v30, 0xff, v30
+; SI-NEXT:    v_add_i32_e32 v36, vcc, 3, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v36, 0xff, v36
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v0, v0, v30
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT:    v_or_b32_e32 v0, v0, v36
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v30, vcc, s6, v19
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v34, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v34, 0xff, v34
+; SI-NEXT:    v_add_i32_e32 v49, vcc, 3, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v49, 0xff, v49
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v0, v0, v34
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; SI-NEXT:    v_or_b32_e32 v0, v0, v49
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v36, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v36, 0xff, v36
+; SI-NEXT:    v_add_i32_e32 v50, vcc, 3, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v50, 0xff, v50
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v0, v0, v36
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT:    v_or_b32_e32 v0, v0, v50
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v52, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v52, 0xff, v52
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v52
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v52, vcc, s6, v54
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v53, vcc, 3, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v53, 0xff, v53
-; SI-NEXT:    v_or_b32_e32 v0, v7, v53
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v53, vcc, s6, v48
-; SI-NEXT:    v_add_i32_e32 v48, vcc, s6, v55
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v48
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_or_b32_e32 v7, v7, v53
+; SI-NEXT:    v_add_i32_e32 v54, vcc, s6, v7
+; SI-NEXT:    v_add_i32_e32 v7, vcc, s6, v14
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v40, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v40, 0xff, v40
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v40
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v40, vcc, s6, v39
-; SI-NEXT:    v_add_i32_e32 v39, vcc, s6, v41
-; SI-NEXT:    v_add_i32_e32 v41, vcc, s6, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v4
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_add_i32_e32 v55, vcc, s6, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v42, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v42, 0xff, v42
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_add_i32_e32 v54, vcc, s6, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v42
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v42, vcc, s6, v38
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_add_i32_e32 v51, vcc, s6, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v43, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v43, 0xff, v43
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v43
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v43, vcc, s6, v37
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v44, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v44, 0xff, v44
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v44
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v44, vcc, s6, v35
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v45, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v45, 0xff, v45
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v45
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v45, vcc, s6, v33
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v46, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v46, 0xff, v46
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v45, vcc, s6, v48
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v46
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v46, vcc, s6, v32
-; SI-NEXT:    v_add_i32_e32 v32, vcc, s6, v50
-; SI-NEXT:    v_add_i32_e32 v50, vcc, s6, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_add_i32_e32 v47, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v47, 0xff, v47
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_add_i32_e32 v49, vcc, s6, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v46, vcc, s6, v39
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v47
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v47, vcc, s6, v31
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v56, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v56, 0xff, v56
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v47, vcc, s6, v38
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v56
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v56, vcc, s6, v27
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v57, vcc, 3, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v57, 0xff, v57
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v56, vcc, s6, v37
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v57
-; SI-NEXT:    v_add_i32_e32 v57, vcc, s6, v25
-; SI-NEXT:    v_add_i32_e32 v25, vcc, s6, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_add_i32_e32 v27, vcc, s6, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v31, vcc, s6, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v58, vcc, 3, v0
-; SI-NEXT:    v_and_b32_e32 v58, 0xff, v58
-; SI-NEXT:    v_or_b32_e32 v0, v14, v58
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v58, vcc, s6, v23
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_add_i32_e32 v23, vcc, s6, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_add_i32_e32 v59, vcc, 3, v0
-; SI-NEXT:    v_and_b32_e32 v59, 0xff, v59
-; SI-NEXT:    v_or_b32_e32 v0, v12, v59
-; SI-NEXT:    v_add_i32_e32 v59, vcc, s6, v21
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_add_i32_e32 v21, vcc, s6, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v19, vcc, s6, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v17, vcc, s6, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v60, vcc, 3, v0
-; SI-NEXT:    v_and_b32_e32 v60, 0xff, v60
-; SI-NEXT:    v_or_b32_e32 v0, v10, v60
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v60, vcc, s6, v13
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_add_i32_e32 v15, vcc, s6, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_add_i32_e32 v61, vcc, 3, v0
-; SI-NEXT:    v_and_b32_e32 v61, 0xff, v61
 ; SI-NEXT:    v_or_b32_e32 v0, v8, v61
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v61, vcc, s6, v11
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_add_i32_e32 v13, vcc, s6, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_add_i32_e32 v62, vcc, 3, v0
-; SI-NEXT:    v_and_b32_e32 v62, 0xff, v62
 ; SI-NEXT:    v_or_b32_e32 v0, v6, v62
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v57, vcc, s6, v35
+; SI-NEXT:    v_add_i32_e32 v61, vcc, s6, v23
 ; SI-NEXT:    v_add_i32_e32 v62, vcc, s6, v1
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_add_i32_e32 v11, vcc, s6, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v63, v0, v63
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v3
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x300, v9
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x300, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_add_i32_e32 v41, vcc, s6, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v55, vcc, s6, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v51, vcc, s6, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v25, vcc, s6, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v31, vcc, s6, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v44, vcc, s6, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_add_i32_e32 v23, vcc, s6, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_add_i32_e32 v43, vcc, s6, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v42, vcc, s6, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v40, vcc, s6, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v53, vcc, s6, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v52, vcc, s6, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v50, vcc, s6, v0
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v9, vcc, s6, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, s6, v63
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v62
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v60
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v13
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v58
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v15
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v17
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v19
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v21
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v56
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v59
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v27
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v25
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v61
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v49
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v50
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v51
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_add_i32_e32 v38, vcc, s6, v0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v49, vcc, s6, v0
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v54
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v55
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v41
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_add_i32_e32 v37, vcc, s6, v0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v48, vcc, s6, v0
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v39, vcc, s6, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v38, vcc, s6, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v37, vcc, s6, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v36, vcc, s6, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v35, vcc, s6, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v34, vcc, s6, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v33, vcc, s6, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v26, vcc, s6, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v30, vcc, s6, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v24, vcc, s6, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v29, vcc, s6, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v22, vcc, s6, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v28, vcc, s6, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v20, vcc, s6, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v26, vcc, s6, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v14, vcc, s6, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v24, vcc, s6, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v12, vcc, s6, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v18, vcc, s6, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v10, vcc, s6, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v16, vcc, s6, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v8, vcc, s6, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, s6, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, s6, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, s6, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, s6, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, s6, v0
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, s6, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v1
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v2
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v3
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v4
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v6
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v8
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v10
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v12
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v14
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v16
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v18
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v20
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v22
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v24
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v26
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v27
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v28
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v29, v46
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v29, v56
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v30
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v32
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v44
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v57
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v33, v46
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v34
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v35
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v42
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v35, v59
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v36
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v36, v44
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v37
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v47
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v37, v57
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v38
-; SI-NEXT:    v_cvt_f32_f16_e32 v38, v53
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v38, v42
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v39
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v45
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v39, v47
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v48
+; SI-NEXT:    v_cvt_f32_f16_e32 v48, v53
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v49
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v50
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v52
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v40
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v43
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v45
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v21, vcc, s6, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v27, v0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v19, vcc, s6, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v17, vcc, s6, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v15, vcc, s6, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v13, vcc, s6, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v11, vcc, s6, v5
+; SI-NEXT:    v_add_i32_e32 v5, vcc, s6, v12
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v7
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v9
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v11
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v13
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v15
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v17
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v60
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v19
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v21
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v23
+; SI-NEXT:    v_cvt_f32_f16_e32 v23, v58
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, v61
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v25
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v51
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v54
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v55
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v41
+; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    v_add_i32_e32 v32, vcc, s6, v12
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v62
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_add_i32_e32 v14, vcc, s6, v12
+; SI-NEXT:    v_add_i32_e32 v12, vcc, s6, v63
+; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v14
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v32
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_add_i32_e32 v63, vcc, s6, v63
 ; SI-NEXT:    v_cvt_f32_f16_e32 v63, v63
-; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v0
+; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
 ; SI-NEXT:  .LBB92_4: ; %end
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 12, v5
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 12, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    buffer_store_dword v0, v5, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v0, v10, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v5
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v10
 ; SI-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v5
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v10
 ; SI-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -180725,9 +180681,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 16, v5
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 16, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -180736,9 +180692,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 20, v5
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 20, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -180747,9 +180703,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 24, v5
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 24, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -180758,9 +180714,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 28, v5
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 28, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -180769,9 +180725,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v5
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -180780,9 +180736,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 36, v5
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 36, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -180791,9 +180747,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 40, v5
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 40, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -180802,9 +180758,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 44, v5
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 44, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -180813,9 +180769,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 48, v5
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 48, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -180824,9 +180780,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 52, v5
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 52, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -180835,9 +180791,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 56, v5
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 56, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -180846,9 +180802,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v5
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -180857,9 +180813,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 64, v5
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 64, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -180868,9 +180824,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v5
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -180879,9 +180835,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x48, v5
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x48, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -180890,9 +180846,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v5
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -180901,9 +180857,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x50, v5
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x50, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -180912,9 +180868,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v5
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -180923,9 +180879,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v5
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -180934,27 +180890,38 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v10
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v48
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v5
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v10
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v38
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v5
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v10
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v35
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v5
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v36
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v10
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
@@ -180962,43 +180929,36 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v39
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v32
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v33
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x6c, v10
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v37
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v29
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x6c, v5
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v10
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v0, v33
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v35
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v23
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v5
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x74, v10
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v31
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v13
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x74, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v0, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v11
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v17
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v10
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v0, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v9
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x7c, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v5
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x7c, v10
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
@@ -185214,7 +185174,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:332
 ; SI-NEXT:    ; implicit-def: $vgpr61 : SGPR spill to VGPR lane
 ; SI-NEXT:    s_mov_b32 s10, s16
-; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_writelane_b32 v61, s29, 0
+; SI-NEXT:    v_writelane_b32 v61, s28, 1
+; SI-NEXT:    v_writelane_b32 v61, s27, 2
+; SI-NEXT:    s_mov_b32 s61, s21
 ; SI-NEXT:    v_writelane_b32 v63, s30, 0
 ; SI-NEXT:    v_writelane_b32 v63, s31, 1
 ; SI-NEXT:    v_writelane_b32 v63, s34, 2
@@ -185249,59 +185213,59 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    v_writelane_b32 v63, s87, 31
 ; SI-NEXT:    v_writelane_b32 v63, s96, 32
 ; SI-NEXT:    v_writelane_b32 v63, s97, 33
-; SI-NEXT:    s_mov_b32 s54, s27
-; SI-NEXT:    s_mov_b32 s79, s29
-; SI-NEXT:    s_mov_b32 s66, s26
-; SI-NEXT:    s_mov_b32 s64, s23
-; SI-NEXT:    s_mov_b32 s65, s19
-; SI-NEXT:    s_mov_b32 s67, s17
+; SI-NEXT:    s_mov_b32 s67, s19
+; SI-NEXT:    s_mov_b32 s54, s17
+; SI-NEXT:    s_mov_b32 s35, s23
+; SI-NEXT:    s_mov_b32 s39, s26
+; SI-NEXT:    s_mov_b32 s62, s25
 ; SI-NEXT:    v_writelane_b32 v63, s98, 34
 ; SI-NEXT:    v_writelane_b32 v63, s99, 35
-; SI-NEXT:    s_mov_b32 s92, s24
-; SI-NEXT:    v_readfirstlane_b32 s31, v1
-; SI-NEXT:    v_readfirstlane_b32 s81, v23
+; SI-NEXT:    v_readfirstlane_b32 s99, v1
+; SI-NEXT:    v_readfirstlane_b32 s74, v24
 ; SI-NEXT:    ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
-; SI-NEXT:    v_readfirstlane_b32 s12, v26
+; SI-NEXT:    v_readfirstlane_b32 s6, v23
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v62, s81, 0
+; SI-NEXT:    v_writelane_b32 v62, s74, 0
+; SI-NEXT:    v_readfirstlane_b32 s12, v26
+; SI-NEXT:    v_writelane_b32 v62, s6, 1
 ; SI-NEXT:    v_readfirstlane_b32 s14, v25
-; SI-NEXT:    v_writelane_b32 v62, s12, 1
+; SI-NEXT:    v_writelane_b32 v62, s12, 2
 ; SI-NEXT:    v_readfirstlane_b32 s46, v28
-; SI-NEXT:    v_writelane_b32 v62, s14, 2
-; SI-NEXT:    v_readfirstlane_b32 s57, v27
-; SI-NEXT:    v_writelane_b32 v62, s46, 3
-; SI-NEXT:    v_readfirstlane_b32 s58, v30
-; SI-NEXT:    v_writelane_b32 v62, s57, 4
-; SI-NEXT:    s_mov_b32 s77, s25
-; SI-NEXT:    v_readfirstlane_b32 s25, v29
-; SI-NEXT:    v_writelane_b32 v62, s58, 5
-; SI-NEXT:    v_writelane_b32 v62, s25, 6
-; SI-NEXT:    v_readfirstlane_b32 s55, v3
-; SI-NEXT:    v_readfirstlane_b32 s80, v5
-; SI-NEXT:    v_readfirstlane_b32 s51, v7
+; SI-NEXT:    v_writelane_b32 v62, s14, 3
+; SI-NEXT:    v_readfirstlane_b32 s56, v27
+; SI-NEXT:    v_writelane_b32 v62, s46, 4
+; SI-NEXT:    v_readfirstlane_b32 s57, v30
+; SI-NEXT:    v_writelane_b32 v62, s56, 5
+; SI-NEXT:    v_readfirstlane_b32 s59, v29
+; SI-NEXT:    v_writelane_b32 v62, s57, 6
+; SI-NEXT:    v_writelane_b32 v62, s59, 7
+; SI-NEXT:    s_mov_b32 s60, s20
+; SI-NEXT:    s_mov_b32 s63, s24
+; SI-NEXT:    v_readfirstlane_b32 s95, v3
+; SI-NEXT:    v_readfirstlane_b32 s31, v5
+; SI-NEXT:    v_readfirstlane_b32 s24, v9
+; SI-NEXT:    v_readfirstlane_b32 s38, v12
 ; SI-NEXT:    v_readfirstlane_b32 s36, v11
-; SI-NEXT:    v_readfirstlane_b32 s87, v16
-; SI-NEXT:    v_readfirstlane_b32 s84, v15
+; SI-NEXT:    v_readfirstlane_b32 s8, v14
+; SI-NEXT:    v_readfirstlane_b32 s27, v13
+; SI-NEXT:    v_readfirstlane_b32 s9, v16
+; SI-NEXT:    v_readfirstlane_b32 s79, v15
 ; SI-NEXT:    v_readfirstlane_b32 s13, v18
 ; SI-NEXT:    v_readfirstlane_b32 s15, v17
 ; SI-NEXT:    v_readfirstlane_b32 s42, v20
 ; SI-NEXT:    v_readfirstlane_b32 s43, v19
 ; SI-NEXT:    v_readfirstlane_b32 s44, v22
-; SI-NEXT:    v_readfirstlane_b32 s73, v21
-; SI-NEXT:    v_readfirstlane_b32 s74, v24
-; SI-NEXT:    v_readfirstlane_b32 s62, v14
-; SI-NEXT:    v_readfirstlane_b32 s9, v13
-; SI-NEXT:    v_readfirstlane_b32 s63, v12
-; SI-NEXT:    v_readfirstlane_b32 s61, v10
-; SI-NEXT:    v_readfirstlane_b32 s94, v9
-; SI-NEXT:    v_readfirstlane_b32 s60, v8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:328
-; SI-NEXT:    v_writelane_b32 v61, s4, 0
-; SI-NEXT:    v_readfirstlane_b32 s35, v6
-; SI-NEXT:    v_readfirstlane_b32 s91, v4
-; SI-NEXT:    v_readfirstlane_b32 s95, v2
+; SI-NEXT:    v_writelane_b32 v61, s4, 3
+; SI-NEXT:    v_readfirstlane_b32 s45, v21
+; SI-NEXT:    v_readfirstlane_b32 s98, v10
+; SI-NEXT:    v_readfirstlane_b32 s90, v8
+; SI-NEXT:    v_readfirstlane_b32 s88, v7
+; SI-NEXT:    v_readfirstlane_b32 s91, v6
+; SI-NEXT:    v_readfirstlane_b32 s93, v4
+; SI-NEXT:    v_readfirstlane_b32 s55, v2
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
@@ -185319,142 +185283,142 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:324
-; SI-NEXT:    v_writelane_b32 v61, s4, 1
+; SI-NEXT:    v_writelane_b32 v61, s4, 4
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:320
-; SI-NEXT:    v_writelane_b32 v61, s4, 2
+; SI-NEXT:    v_writelane_b32 v61, s4, 5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:316
-; SI-NEXT:    v_writelane_b32 v61, s4, 3
+; SI-NEXT:    v_writelane_b32 v61, s4, 6
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:312
-; SI-NEXT:    v_writelane_b32 v61, s4, 4
+; SI-NEXT:    v_writelane_b32 v61, s4, 7
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:308
-; SI-NEXT:    v_writelane_b32 v61, s4, 5
+; SI-NEXT:    v_writelane_b32 v61, s4, 8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:304
-; SI-NEXT:    v_writelane_b32 v61, s4, 6
+; SI-NEXT:    v_writelane_b32 v61, s4, 9
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:300
-; SI-NEXT:    v_writelane_b32 v61, s4, 7
+; SI-NEXT:    v_writelane_b32 v61, s4, 10
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:296
-; SI-NEXT:    v_writelane_b32 v61, s4, 8
+; SI-NEXT:    v_writelane_b32 v61, s4, 11
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:292
-; SI-NEXT:    v_writelane_b32 v61, s4, 9
+; SI-NEXT:    v_writelane_b32 v61, s4, 12
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:288
-; SI-NEXT:    v_writelane_b32 v61, s4, 10
+; SI-NEXT:    v_writelane_b32 v61, s4, 13
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:284
-; SI-NEXT:    v_writelane_b32 v61, s4, 11
+; SI-NEXT:    v_writelane_b32 v61, s4, 14
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:280
-; SI-NEXT:    v_writelane_b32 v61, s4, 12
+; SI-NEXT:    v_writelane_b32 v61, s4, 15
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:276
-; SI-NEXT:    v_writelane_b32 v61, s4, 13
+; SI-NEXT:    v_writelane_b32 v61, s4, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:272
-; SI-NEXT:    v_writelane_b32 v61, s4, 14
+; SI-NEXT:    v_writelane_b32 v61, s4, 17
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:268
-; SI-NEXT:    v_writelane_b32 v61, s4, 15
+; SI-NEXT:    v_writelane_b32 v61, s4, 18
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:264
-; SI-NEXT:    v_writelane_b32 v61, s4, 16
+; SI-NEXT:    v_writelane_b32 v61, s4, 19
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:260
-; SI-NEXT:    v_writelane_b32 v61, s4, 17
+; SI-NEXT:    v_writelane_b32 v61, s4, 20
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:256
-; SI-NEXT:    v_writelane_b32 v61, s4, 18
+; SI-NEXT:    v_writelane_b32 v61, s4, 21
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:252
-; SI-NEXT:    v_writelane_b32 v61, s4, 19
+; SI-NEXT:    v_writelane_b32 v61, s4, 22
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:248
-; SI-NEXT:    v_writelane_b32 v61, s4, 20
+; SI-NEXT:    v_writelane_b32 v61, s4, 23
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:244
-; SI-NEXT:    v_writelane_b32 v61, s4, 21
+; SI-NEXT:    v_writelane_b32 v61, s4, 24
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:240
-; SI-NEXT:    v_writelane_b32 v61, s4, 22
+; SI-NEXT:    v_writelane_b32 v61, s4, 25
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:236
-; SI-NEXT:    v_writelane_b32 v61, s4, 23
+; SI-NEXT:    v_writelane_b32 v61, s4, 26
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:232
-; SI-NEXT:    v_writelane_b32 v61, s4, 24
+; SI-NEXT:    v_writelane_b32 v61, s4, 27
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:228
-; SI-NEXT:    v_writelane_b32 v61, s4, 25
+; SI-NEXT:    v_writelane_b32 v61, s4, 28
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:224
-; SI-NEXT:    v_writelane_b32 v61, s4, 26
+; SI-NEXT:    v_writelane_b32 v61, s4, 29
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:220
-; SI-NEXT:    v_writelane_b32 v61, s4, 27
+; SI-NEXT:    v_writelane_b32 v61, s4, 30
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s16, v31
+; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:216
+; SI-NEXT:    v_writelane_b32 v61, s4, 31
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:212
-; SI-NEXT:    v_writelane_b32 v61, s4, 28
+; SI-NEXT:    v_writelane_b32 v61, s4, 32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s4, v31
+; SI-NEXT:    v_readfirstlane_b32 s16, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:208
-; SI-NEXT:    v_writelane_b32 v61, s4, 29
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:204
-; SI-NEXT:    v_writelane_b32 v61, s4, 30
+; SI-NEXT:    v_writelane_b32 v61, s4, 33
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s89, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:200
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:196
-; SI-NEXT:    v_writelane_b32 v61, s4, 31
+; SI-NEXT:    v_writelane_b32 v61, s4, 34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s93, v31
+; SI-NEXT:    v_readfirstlane_b32 s73, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:192
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:188
-; SI-NEXT:    v_writelane_b32 v61, s4, 32
+; SI-NEXT:    v_writelane_b32 v61, s4, 35
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_readfirstlane_b32 s72, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:184
@@ -185462,265 +185426,270 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    v_readfirstlane_b32 s40, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:180
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s97, v31
+; SI-NEXT:    v_readfirstlane_b32 s21, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:176
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s45, v31
+; SI-NEXT:    v_readfirstlane_b32 s85, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:172
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s85, v31
+; SI-NEXT:    v_readfirstlane_b32 s81, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:168
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s11, v31
+; SI-NEXT:    v_readfirstlane_b32 s97, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:164
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s4, v31
+; SI-NEXT:    v_readfirstlane_b32 s7, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:160
-; SI-NEXT:    v_writelane_b32 v61, s4, 33
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s7, v31
+; SI-NEXT:    v_readfirstlane_b32 s11, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:156
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s47, v31
+; SI-NEXT:    v_readfirstlane_b32 s41, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:152
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s41, v31
+; SI-NEXT:    v_readfirstlane_b32 s47, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:148
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s59, v31
+; SI-NEXT:    v_readfirstlane_b32 s58, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:144
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s56, v31
+; SI-NEXT:    v_readfirstlane_b32 s76, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:140
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s78, v31
+; SI-NEXT:    v_readfirstlane_b32 s29, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:136
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s27, v31
+; SI-NEXT:    v_readfirstlane_b32 s4, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:132
+; SI-NEXT:    v_writelane_b32 v61, s4, 36
+; SI-NEXT:    v_writelane_b32 v61, s54, 37
+; SI-NEXT:    v_writelane_b32 v61, s10, 38
+; SI-NEXT:    v_writelane_b32 v61, s67, 39
+; SI-NEXT:    v_writelane_b32 v61, s18, 40
+; SI-NEXT:    v_writelane_b32 v61, s61, 41
+; SI-NEXT:    v_writelane_b32 v61, s60, 42
+; SI-NEXT:    v_writelane_b32 v61, s35, 43
+; SI-NEXT:    v_writelane_b32 v61, s22, 44
+; SI-NEXT:    v_writelane_b32 v61, s62, 45
+; SI-NEXT:    v_writelane_b32 v61, s63, 46
+; SI-NEXT:    v_writelane_b32 v61, s39, 47
+; SI-NEXT:    v_writelane_b32 v61, s99, 48
+; SI-NEXT:    v_writelane_b32 v61, s95, 49
+; SI-NEXT:    v_writelane_b32 v61, s31, 50
+; SI-NEXT:    v_writelane_b32 v61, s24, 51
+; SI-NEXT:    v_writelane_b32 v61, s38, 52
+; SI-NEXT:    v_writelane_b32 v61, s36, 53
+; SI-NEXT:    v_writelane_b32 v61, s8, 54
+; SI-NEXT:    v_writelane_b32 v61, s27, 55
+; SI-NEXT:    v_writelane_b32 v61, s9, 56
+; SI-NEXT:    v_writelane_b32 v61, s79, 57
+; SI-NEXT:    v_writelane_b32 v61, s13, 58
+; SI-NEXT:    v_writelane_b32 v61, s15, 59
+; SI-NEXT:    v_writelane_b32 v61, s42, 60
+; SI-NEXT:    v_writelane_b32 v61, s43, 61
+; SI-NEXT:    v_writelane_b32 v61, s44, 62
+; SI-NEXT:    v_writelane_b32 v61, s45, 63
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s39, v31
+; SI-NEXT:    v_readfirstlane_b32 s37, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s53, v31
+; SI-NEXT:    v_readfirstlane_b32 s50, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:124
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s50, v31
+; SI-NEXT:    v_readfirstlane_b32 s48, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s4, v31
+; SI-NEXT:    v_readfirstlane_b32 s19, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
-; SI-NEXT:    v_writelane_b32 v61, s4, 34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s29, v31
+; SI-NEXT:    v_readfirstlane_b32 s64, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:112
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s26, v31
+; SI-NEXT:    v_readfirstlane_b32 s17, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:108
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s23, v31
+; SI-NEXT:    v_readfirstlane_b32 s65, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:104
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s4, v31
+; SI-NEXT:    v_readfirstlane_b32 s71, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:100
-; SI-NEXT:    v_writelane_b32 v61, s4, 35
-; SI-NEXT:    v_writelane_b32 v61, s67, 36
-; SI-NEXT:    v_writelane_b32 v61, s10, 37
-; SI-NEXT:    v_writelane_b32 v61, s65, 38
-; SI-NEXT:    v_writelane_b32 v61, s18, 39
-; SI-NEXT:    v_writelane_b32 v61, s21, 40
-; SI-NEXT:    v_writelane_b32 v61, s20, 41
-; SI-NEXT:    v_writelane_b32 v61, s64, 42
-; SI-NEXT:    v_writelane_b32 v61, s22, 43
-; SI-NEXT:    v_writelane_b32 v61, s77, 44
-; SI-NEXT:    v_writelane_b32 v61, s92, 45
-; SI-NEXT:    v_writelane_b32 v61, s54, 46
-; SI-NEXT:    v_writelane_b32 v61, s66, 47
-; SI-NEXT:    v_writelane_b32 v61, s79, 48
-; SI-NEXT:    v_writelane_b32 v61, s31, 49
-; SI-NEXT:    v_writelane_b32 v61, s28, 50
-; SI-NEXT:    v_writelane_b32 v61, s55, 51
-; SI-NEXT:    v_writelane_b32 v61, s80, 52
-; SI-NEXT:    v_writelane_b32 v61, s51, 53
-; SI-NEXT:    v_writelane_b32 v61, s36, 54
-; SI-NEXT:    v_writelane_b32 v61, s87, 55
-; SI-NEXT:    v_writelane_b32 v61, s84, 56
-; SI-NEXT:    v_writelane_b32 v61, s13, 57
-; SI-NEXT:    v_writelane_b32 v61, s15, 58
-; SI-NEXT:    v_writelane_b32 v61, s42, 59
-; SI-NEXT:    v_writelane_b32 v61, s43, 60
-; SI-NEXT:    v_writelane_b32 v61, s44, 61
-; SI-NEXT:    v_writelane_b32 v61, s73, 62
-; SI-NEXT:    v_writelane_b32 v61, s74, 63
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s19, v31
+; SI-NEXT:    v_readfirstlane_b32 s70, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:96
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s71, v31
+; SI-NEXT:    v_readfirstlane_b32 s83, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s17, v31
+; SI-NEXT:    v_readfirstlane_b32 s49, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:88
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s70, v31
+; SI-NEXT:    v_readfirstlane_b32 s80, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s37, v31
+; SI-NEXT:    v_readfirstlane_b32 s82, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s82, v31
+; SI-NEXT:    v_readfirstlane_b32 s87, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:76
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s83, v31
+; SI-NEXT:    v_readfirstlane_b32 s84, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s86, v31
+; SI-NEXT:    v_readfirstlane_b32 s51, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s30, v31
+; SI-NEXT:    v_readfirstlane_b32 s86, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s96, v31
+; SI-NEXT:    v_readfirstlane_b32 s94, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:60
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s48, v31
+; SI-NEXT:    v_readfirstlane_b32 s96, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:56
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s98, v31
+; SI-NEXT:    v_readfirstlane_b32 s68, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s38, v31
+; SI-NEXT:    v_readfirstlane_b32 s34, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s68, v31
+; SI-NEXT:    v_readfirstlane_b32 s77, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:44
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s99, v31
+; SI-NEXT:    v_readfirstlane_b32 s66, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s69, v31
+; SI-NEXT:    v_readfirstlane_b32 s78, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:36
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s49, v31
+; SI-NEXT:    v_readfirstlane_b32 s53, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s6, v31
+; SI-NEXT:    v_readfirstlane_b32 s69, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s90, v31
+; SI-NEXT:    v_readfirstlane_b32 s30, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s34, v31
+; SI-NEXT:    v_readfirstlane_b32 s52, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:20
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s52, v31
+; SI-NEXT:    v_readfirstlane_b32 s75, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s88, v31
+; SI-NEXT:    v_readfirstlane_b32 s23, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:12
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s8, v31
+; SI-NEXT:    v_readfirstlane_b32 s28, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s24, v31
+; SI-NEXT:    v_readfirstlane_b32 s26, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s76, v31
+; SI-NEXT:    v_readfirstlane_b32 s25, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; SI-NEXT:    v_writelane_b32 v62, s76, 7
-; SI-NEXT:    v_writelane_b32 v62, s8, 8
+; SI-NEXT:    v_writelane_b32 v62, s25, 8
+; SI-NEXT:    v_writelane_b32 v62, s28, 9
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_readfirstlane_b32 s75, v31
-; SI-NEXT:    v_writelane_b32 v62, s75, 9
-; SI-NEXT:    v_writelane_b32 v62, s52, 10
-; SI-NEXT:    v_writelane_b32 v62, s24, 11
-; SI-NEXT:    v_writelane_b32 v62, s90, 12
-; SI-NEXT:    v_writelane_b32 v62, s88, 13
-; SI-NEXT:    v_writelane_b32 v62, s34, 14
-; SI-NEXT:    v_writelane_b32 v62, s17, 15
-; SI-NEXT:    v_writelane_b32 v62, s71, 16
-; SI-NEXT:    v_writelane_b32 v62, s70, 17
-; SI-NEXT:    v_writelane_b32 v62, s37, 18
-; SI-NEXT:    v_writelane_b32 v62, s83, 19
-; SI-NEXT:    v_writelane_b32 v62, s82, 20
-; SI-NEXT:    v_writelane_b32 v62, s30, 21
-; SI-NEXT:    v_writelane_b32 v62, s86, 22
-; SI-NEXT:    v_writelane_b32 v62, s48, 23
-; SI-NEXT:    v_writelane_b32 v62, s38, 24
-; SI-NEXT:    v_writelane_b32 v62, s96, 25
-; SI-NEXT:    v_writelane_b32 v62, s49, 26
-; SI-NEXT:    v_writelane_b32 v62, s99, 27
-; SI-NEXT:    v_writelane_b32 v62, s98, 28
-; SI-NEXT:    v_writelane_b32 v62, s6, 29
-; SI-NEXT:    v_writelane_b32 v62, s68, 30
-; SI-NEXT:    v_writelane_b32 v62, s69, 31
-; SI-NEXT:    s_cbranch_scc0 .LBB93_2
+; SI-NEXT:    v_readfirstlane_b32 s92, v31
+; SI-NEXT:    v_writelane_b32 v62, s92, 10
+; SI-NEXT:    v_writelane_b32 v62, s75, 11
+; SI-NEXT:    v_writelane_b32 v62, s26, 12
+; SI-NEXT:    v_writelane_b32 v62, s30, 13
+; SI-NEXT:    v_writelane_b32 v62, s23, 14
+; SI-NEXT:    v_writelane_b32 v62, s52, 15
+; SI-NEXT:    v_writelane_b32 v62, s64, 16
+; SI-NEXT:    v_writelane_b32 v62, s17, 17
+; SI-NEXT:    v_writelane_b32 v62, s65, 18
+; SI-NEXT:    v_writelane_b32 v62, s70, 19
+; SI-NEXT:    v_writelane_b32 v62, s71, 20
+; SI-NEXT:    v_writelane_b32 v62, s49, 21
+; SI-NEXT:    v_writelane_b32 v62, s83, 22
+; SI-NEXT:    v_writelane_b32 v62, s80, 23
+; SI-NEXT:    v_writelane_b32 v62, s82, 24
+; SI-NEXT:    v_writelane_b32 v62, s84, 25
+; SI-NEXT:    v_writelane_b32 v62, s87, 26
+; SI-NEXT:    v_writelane_b32 v62, s86, 27
+; SI-NEXT:    v_writelane_b32 v62, s51, 28
+; SI-NEXT:    v_writelane_b32 v62, s96, 29
+; SI-NEXT:    v_writelane_b32 v62, s34, 30
+; SI-NEXT:    v_writelane_b32 v62, s94, 31
+; SI-NEXT:    v_writelane_b32 v62, s53, 32
+; SI-NEXT:    v_writelane_b32 v62, s66, 33
+; SI-NEXT:    v_writelane_b32 v62, s68, 34
+; SI-NEXT:    v_writelane_b32 v62, s69, 35
+; SI-NEXT:    v_writelane_b32 v62, s77, 36
+; SI-NEXT:    v_writelane_b32 v62, s78, 37
+; SI-NEXT:    s_cbranch_scc0 .LBB93_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s10, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s67, 8
+; SI-NEXT:    s_lshl_b32 s5, s54, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, s4
 ; SI-NEXT:    s_and_b32 s4, s18, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s65, 8
+; SI-NEXT:    s_lshl_b32 s5, s67, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s20, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s21, 8
+; SI-NEXT:    s_and_b32 s4, s60, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s61, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s4
 ; SI-NEXT:    s_and_b32 s4, s22, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s64, 8
+; SI-NEXT:    s_lshl_b32 s5, s35, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s4
-; SI-NEXT:    s_and_b32 s4, s92, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s77, 8
+; SI-NEXT:    s_and_b32 s4, s63, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s62, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s4
-; SI-NEXT:    s_and_b32 s4, s66, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s54, 8
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, s4
-; SI-NEXT:    s_and_b32 s4, s28, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s79, 8
+; SI-NEXT:    v_readlane_b32 s5, v61, 2
+; SI-NEXT:    s_and_b32 s4, s39, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s4
-; SI-NEXT:    s_and_b32 s4, s31, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s95, 8
+; SI-NEXT:    v_readlane_b32 s4, v61, 1
+; SI-NEXT:    v_readlane_b32 s5, v61, 0
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
+; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, s4
+; SI-NEXT:    s_and_b32 s4, s99, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s55, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, s4
-; SI-NEXT:    s_and_b32 s4, s55, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s91, 8
+; SI-NEXT:    s_and_b32 s4, s95, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s93, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, s4
-; SI-NEXT:    s_and_b32 s4, s80, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s35, 8
+; SI-NEXT:    s_and_b32 s4, s31, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s91, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, s4
-; SI-NEXT:    s_and_b32 s4, s51, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s60, 8
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, s4
+; SI-NEXT:    s_and_b32 s4, s88, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s90, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v9, s4
-; SI-NEXT:    s_and_b32 s4, s94, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s61, 8
+; SI-NEXT:    s_and_b32 s4, s24, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s98, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v13, s4
 ; SI-NEXT:    s_and_b32 s4, s36, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s63, 8
+; SI-NEXT:    s_lshl_b32 s5, s38, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, s4
-; SI-NEXT:    s_and_b32 s4, s9, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s62, 8
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, s4
+; SI-NEXT:    s_and_b32 s4, s27, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s8, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, s4
-; SI-NEXT:    s_and_b32 s4, s84, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s87, 8
+; SI-NEXT:    s_and_b32 s4, s79, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s9, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, s4
 ; SI-NEXT:    s_and_b32 s4, s15, 0xff
@@ -185731,11 +185700,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s5, s42, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v16, s4
-; SI-NEXT:    s_and_b32 s4, s73, 0xff
+; SI-NEXT:    s_and_b32 s4, s45, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s44, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v19, s4
-; SI-NEXT:    s_and_b32 s4, s81, 0xff
+; SI-NEXT:    s_and_b32 s4, s6, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s74, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v18, s4
@@ -185743,393 +185712,244 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    s_lshl_b32 s5, s12, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v21, s4
-; SI-NEXT:    s_and_b32 s4, s57, 0xff
+; SI-NEXT:    s_and_b32 s4, s56, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s46, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v20, s4
-; SI-NEXT:    s_and_b32 s4, s25, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s58, 8
+; SI-NEXT:    s_and_b32 s4, s59, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s57, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v23, s4
-; SI-NEXT:    s_and_b32 s4, s75, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s76, 8
+; SI-NEXT:    s_and_b32 s4, s92, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s25, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v22, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s8, 8
+; SI-NEXT:    s_and_b32 s4, s26, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s28, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v25, s4
-; SI-NEXT:    s_and_b32 s4, s88, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s52, 8
+; SI-NEXT:    s_and_b32 s4, s23, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s75, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v24, s4
-; SI-NEXT:    s_and_b32 s4, s34, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s90, 8
+; SI-NEXT:    s_and_b32 s4, s52, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s30, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v27, s4
-; SI-NEXT:    s_and_b32 s4, s6, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s49, 8
+; SI-NEXT:    s_and_b32 s4, s69, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s53, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v26, s4
-; SI-NEXT:    s_and_b32 s4, s69, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s99, 8
+; SI-NEXT:    s_and_b32 s4, s78, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s66, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v29, s4
-; SI-NEXT:    s_and_b32 s4, s68, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s38, 8
+; SI-NEXT:    s_and_b32 s4, s77, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s34, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v28, s4
-; SI-NEXT:    s_and_b32 s4, s98, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s48, 8
+; SI-NEXT:    s_and_b32 s4, s68, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s96, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, s4
-; SI-NEXT:    s_and_b32 s4, s96, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s30, 8
+; SI-NEXT:    s_and_b32 s4, s94, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s86, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v30, s4
-; SI-NEXT:    s_and_b32 s4, s86, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s83, 8
+; SI-NEXT:    s_and_b32 s4, s51, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s84, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, s4
-; SI-NEXT:    s_and_b32 s4, s82, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s37, 8
+; SI-NEXT:    s_and_b32 s4, s87, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s82, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v32, s4
-; SI-NEXT:    s_and_b32 s4, s70, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s17, 8
+; SI-NEXT:    s_and_b32 s4, s80, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s49, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v35, s4
-; SI-NEXT:    s_and_b32 s4, s71, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s19, 8
+; SI-NEXT:    s_and_b32 s4, s83, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s70, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s13, v61, 35
 ; SI-NEXT:    v_cvt_f32_f16_e32 v34, s4
-; SI-NEXT:    s_and_b32 s4, s13, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s23, 8
+; SI-NEXT:    s_and_b32 s4, s71, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s65, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v37, s4
-; SI-NEXT:    s_and_b32 s4, s26, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s29, 8
-; SI-NEXT:    s_mov_b32 s79, s9
+; SI-NEXT:    s_and_b32 s4, s17, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s64, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s9, v61, 34
 ; SI-NEXT:    v_cvt_f32_f16_e32 v36, s4
-; SI-NEXT:    s_and_b32 s4, s9, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s50, 8
+; SI-NEXT:    s_and_b32 s4, s19, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s48, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v39, s4
-; SI-NEXT:    s_and_b32 s4, s53, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s39, 8
+; SI-NEXT:    s_and_b32 s4, s50, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s37, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    v_readlane_b32 s8, v61, 36
 ; SI-NEXT:    v_cvt_f32_f16_e32 v38, s4
-; SI-NEXT:    s_and_b32 s4, s27, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s78, 8
+; SI-NEXT:    s_and_b32 s4, s8, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s29, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, s4
-; SI-NEXT:    s_and_b32 s4, s56, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s59, 8
+; SI-NEXT:    s_and_b32 s4, s76, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s58, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v48, s4
-; SI-NEXT:    s_and_b32 s4, s41, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s47, 8
+; SI-NEXT:    s_and_b32 s4, s47, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s41, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s21, v61, 33
 ; SI-NEXT:    v_cvt_f32_f16_e32 v51, s4
-; SI-NEXT:    s_and_b32 s4, s7, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s21, 8
+; SI-NEXT:    s_and_b32 s4, s11, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s7, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v50, s4
-; SI-NEXT:    s_and_b32 s4, s11, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s85, 8
+; SI-NEXT:    s_and_b32 s4, s97, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s81, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v53, s4
-; SI-NEXT:    s_and_b32 s4, s45, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s97, 8
+; SI-NEXT:    s_and_b32 s4, s85, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s21, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v52, s4
 ; SI-NEXT:    s_and_b32 s4, s40, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s72, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s69, v61, 32
+; SI-NEXT:    v_readlane_b32 s69, v61, 35
 ; SI-NEXT:    v_cvt_f32_f16_e32 v55, s4
 ; SI-NEXT:    s_and_b32 s4, s69, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s93, 8
+; SI-NEXT:    s_lshl_b32 s5, s73, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s68, v61, 31
+; SI-NEXT:    v_readlane_b32 s68, v61, 34
 ; SI-NEXT:    v_cvt_f32_f16_e32 v54, s4
 ; SI-NEXT:    s_and_b32 s4, s68, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s89, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s66, v61, 30
-; SI-NEXT:    v_readlane_b32 s20, v61, 29
+; SI-NEXT:    v_readlane_b32 s66, v61, 33
 ; SI-NEXT:    v_cvt_f32_f16_e32 v41, s4
 ; SI-NEXT:    s_and_b32 s4, s66, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s20, 8
-; SI-NEXT:    s_mov_b32 s17, s19
-; SI-NEXT:    s_mov_b32 s19, s23
-; SI-NEXT:    s_mov_b32 s23, s26
-; SI-NEXT:    s_mov_b32 s26, s29
-; SI-NEXT:    s_mov_b32 s29, s53
+; SI-NEXT:    s_lshl_b32 s5, s16, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s53, v61, 28
+; SI-NEXT:    v_readlane_b32 s53, v61, 32
+; SI-NEXT:    v_readlane_b32 s94, v61, 31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v40, s4
 ; SI-NEXT:    s_and_b32 s4, s53, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s16, 8
+; SI-NEXT:    s_lshl_b32 s5, s94, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s34, v61, 27
-; SI-NEXT:    v_readlane_b32 s6, v61, 26
+; SI-NEXT:    v_readlane_b32 s34, v61, 30
+; SI-NEXT:    v_readlane_b32 s96, v61, 29
 ; SI-NEXT:    v_cvt_f32_f16_e32 v43, s4
 ; SI-NEXT:    s_and_b32 s4, s34, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s6, 8
+; SI-NEXT:    s_lshl_b32 s5, s96, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s98, v61, 25
-; SI-NEXT:    v_readlane_b32 s99, v61, 24
+; SI-NEXT:    v_readlane_b32 s51, v61, 28
+; SI-NEXT:    v_readlane_b32 s86, v61, 27
 ; SI-NEXT:    v_cvt_f32_f16_e32 v42, s4
-; SI-NEXT:    s_and_b32 s4, s98, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s99, 8
+; SI-NEXT:    s_and_b32 s4, s51, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s86, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s49, v61, 23
-; SI-NEXT:    v_readlane_b32 s96, v61, 22
+; SI-NEXT:    v_readlane_b32 s87, v61, 26
+; SI-NEXT:    v_readlane_b32 s84, v61, 25
 ; SI-NEXT:    v_cvt_f32_f16_e32 v45, s4
-; SI-NEXT:    s_and_b32 s4, s49, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s96, 8
+; SI-NEXT:    s_and_b32 s4, s87, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s84, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s38, v61, 21
-; SI-NEXT:    v_readlane_b32 s48, v61, 20
+; SI-NEXT:    v_readlane_b32 s82, v61, 24
+; SI-NEXT:    v_readlane_b32 s80, v61, 23
 ; SI-NEXT:    v_cvt_f32_f16_e32 v44, s4
-; SI-NEXT:    s_and_b32 s4, s38, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s48, 8
+; SI-NEXT:    s_and_b32 s4, s82, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s80, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s86, v61, 19
-; SI-NEXT:    v_readlane_b32 s30, v61, 18
+; SI-NEXT:    v_readlane_b32 s83, v61, 22
+; SI-NEXT:    v_readlane_b32 s49, v61, 21
 ; SI-NEXT:    v_cvt_f32_f16_e32 v47, s4
-; SI-NEXT:    s_and_b32 s4, s86, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s30, 8
+; SI-NEXT:    s_and_b32 s4, s83, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s49, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s82, v61, 17
-; SI-NEXT:    v_readlane_b32 s83, v61, 16
+; SI-NEXT:    v_readlane_b32 s71, v61, 20
+; SI-NEXT:    v_readlane_b32 s70, v61, 19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v46, s4
-; SI-NEXT:    s_and_b32 s4, s82, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s83, 8
+; SI-NEXT:    s_and_b32 s4, s71, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s70, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s37, v61, 15
-; SI-NEXT:    v_readlane_b32 s70, v61, 14
+; SI-NEXT:    v_readlane_b32 s65, v61, 18
+; SI-NEXT:    v_readlane_b32 s54, v61, 17
 ; SI-NEXT:    v_cvt_f32_f16_e32 v57, s4
-; SI-NEXT:    s_and_b32 s4, s37, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s70, 8
+; SI-NEXT:    s_and_b32 s4, s65, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s54, 8
+; SI-NEXT:    s_mov_b32 s17, s19
+; SI-NEXT:    s_mov_b32 s19, s50
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s71, v61, 13
-; SI-NEXT:    v_readlane_b32 s67, v61, 12
+; SI-NEXT:    v_readlane_b32 s67, v61, 16
+; SI-NEXT:    v_readlane_b32 s50, v61, 15
 ; SI-NEXT:    v_cvt_f32_f16_e32 v56, s4
-; SI-NEXT:    s_and_b32 s4, s71, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s67, 8
+; SI-NEXT:    s_and_b32 s4, s67, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s50, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s65, v61, 11
-; SI-NEXT:    v_readlane_b32 s64, v61, 10
+; SI-NEXT:    v_readlane_b32 s64, v61, 14
+; SI-NEXT:    v_readlane_b32 s52, v61, 13
 ; SI-NEXT:    v_cvt_f32_f16_e32 v59, s4
-; SI-NEXT:    s_and_b32 s4, s65, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s64, 8
-; SI-NEXT:    s_mov_b32 s88, s50
+; SI-NEXT:    s_and_b32 s4, s64, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s52, 8
+; SI-NEXT:    s_mov_b32 s23, s48
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s54, v61, 9
-; SI-NEXT:    v_readlane_b32 s50, v61, 8
+; SI-NEXT:    v_readlane_b32 s35, v61, 12
+; SI-NEXT:    v_readlane_b32 s48, v61, 11
 ; SI-NEXT:    v_cvt_f32_f16_e32 v58, s4
-; SI-NEXT:    s_and_b32 s4, s54, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s50, 8
-; SI-NEXT:    s_mov_b32 s24, s39
+; SI-NEXT:    s_and_b32 s4, s35, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s48, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s90, v61, 7
-; SI-NEXT:    v_readlane_b32 s39, v61, 6
+; SI-NEXT:    v_readlane_b32 s30, v61, 10
+; SI-NEXT:    v_readlane_b32 s39, v61, 9
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s4
-; SI-NEXT:    s_and_b32 s4, s90, 0xff
+; SI-NEXT:    s_and_b32 s4, s30, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s39, 8
+; SI-NEXT:    s_mov_b32 s26, s37
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s52, v61, 5
-; SI-NEXT:    v_readlane_b32 s75, v61, 4
+; SI-NEXT:    v_readlane_b32 s37, v61, 8
+; SI-NEXT:    v_readlane_b32 s75, v61, 7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v60, s4
-; SI-NEXT:    s_and_b32 s4, s52, 0xff
+; SI-NEXT:    s_and_b32 s4, s37, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s75, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s92, v61, 3
-; SI-NEXT:    v_readlane_b32 s77, v61, 2
+; SI-NEXT:    v_readlane_b32 s92, v61, 6
+; SI-NEXT:    v_readlane_b32 s77, v61, 5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
 ; SI-NEXT:    s_and_b32 s4, s92, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s77, 8
-; SI-NEXT:    s_mov_b32 s8, s27
-; SI-NEXT:    s_mov_b32 s27, s78
+; SI-NEXT:    s_mov_b32 s28, s29
+; SI-NEXT:    s_mov_b32 s29, s76
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s78, v61, 1
-; SI-NEXT:    v_readlane_b32 s76, v61, 0
+; SI-NEXT:    v_readlane_b32 s78, v61, 4
+; SI-NEXT:    v_readlane_b32 s76, v61, 3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
 ; SI-NEXT:    s_and_b32 s4, s78, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s76, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_mov_b32 s31, s95
-; SI-NEXT:    s_mov_b32 s57, s56
-; SI-NEXT:    s_mov_b32 s25, s59
-; SI-NEXT:    s_mov_b32 s14, s41
-; SI-NEXT:    s_mov_b32 s46, s47
-; SI-NEXT:    s_mov_b32 s12, s7
-; SI-NEXT:    s_mov_b32 s7, s11
-; SI-NEXT:    s_mov_b32 s11, s85
-; SI-NEXT:    s_mov_b32 s81, s45
-; SI-NEXT:    s_mov_b32 s85, s97
-; SI-NEXT:    s_mov_b32 s45, s40
+; SI-NEXT:    s_mov_b32 s99, s55
+; SI-NEXT:    s_mov_b32 s20, s88
+; SI-NEXT:    s_mov_b32 s24, s98
+; SI-NEXT:    s_mov_b32 s59, s58
+; SI-NEXT:    s_mov_b32 s56, s47
+; SI-NEXT:    s_mov_b32 s46, s41
+; SI-NEXT:    s_mov_b32 s12, s11
+; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s7, s97
+; SI-NEXT:    s_mov_b32 s97, s81
+; SI-NEXT:    s_mov_b32 s81, s85
+; SI-NEXT:    s_mov_b32 s6, s40
 ; SI-NEXT:    s_mov_b32 s40, s72
-; SI-NEXT:    s_mov_b32 s44, s93
+; SI-NEXT:    s_mov_b32 s45, s73
 ; SI-NEXT:    s_mov_b32 s15, s89
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
-; SI-NEXT:    s_mov_b64 s[4:5], 0
-; SI-NEXT:    s_mov_b32 s95, s91
-; SI-NEXT:    s_mov_b32 s55, s35
-; SI-NEXT:    s_mov_b32 s80, s60
-; SI-NEXT:    s_mov_b32 s91, s61
-; SI-NEXT:    s_mov_b32 s51, s63
-; SI-NEXT:    s_mov_b32 s36, s62
-; SI-NEXT:    s_branch .LBB93_3
-; SI-NEXT:  .LBB93_2:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    s_mov_b32 s17, s19
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    s_mov_b32 s19, s23
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    s_mov_b32 s23, s26
-; SI-NEXT:    s_mov_b32 s26, s29
-; SI-NEXT:    s_mov_b32 s29, s53
-; SI-NEXT:    s_mov_b32 s88, s50
-; SI-NEXT:    s_mov_b32 s24, s39
-; SI-NEXT:    s_mov_b32 s8, s27
-; SI-NEXT:    s_mov_b32 s27, s78
-; SI-NEXT:    s_mov_b32 s25, s59
-; SI-NEXT:    s_mov_b32 s57, s56
-; SI-NEXT:    s_mov_b32 s46, s47
-; SI-NEXT:    s_mov_b32 s14, s41
-; SI-NEXT:    s_mov_b32 s12, s7
-; SI-NEXT:    s_mov_b32 s7, s11
-; SI-NEXT:    s_mov_b32 s11, s85
-; SI-NEXT:    s_mov_b32 s85, s97
-; SI-NEXT:    s_mov_b32 s81, s45
-; SI-NEXT:    s_mov_b32 s45, s40
-; SI-NEXT:    s_mov_b32 s40, s72
-; SI-NEXT:    s_mov_b32 s44, s93
-; SI-NEXT:    s_mov_b32 s15, s89
-; SI-NEXT:    s_mov_b32 s79, s9
-; SI-NEXT:    s_mov_b32 s31, s95
-; SI-NEXT:    s_mov_b64 s[4:5], -1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    v_readlane_b32 s75, v61, 4
-; SI-NEXT:    v_readlane_b32 s76, v61, 0
-; SI-NEXT:    v_readlane_b32 s77, v61, 2
-; SI-NEXT:    v_readlane_b32 s78, v61, 1
-; SI-NEXT:    v_readlane_b32 s92, v61, 3
-; SI-NEXT:    v_readlane_b32 s39, v61, 6
-; SI-NEXT:    v_readlane_b32 s52, v61, 5
-; SI-NEXT:    v_readlane_b32 s90, v61, 7
-; SI-NEXT:    v_readlane_b32 s50, v61, 8
-; SI-NEXT:    v_readlane_b32 s64, v61, 10
-; SI-NEXT:    v_readlane_b32 s54, v61, 9
-; SI-NEXT:    v_readlane_b32 s67, v61, 12
-; SI-NEXT:    v_readlane_b32 s65, v61, 11
-; SI-NEXT:    v_readlane_b32 s70, v61, 14
-; SI-NEXT:    v_readlane_b32 s71, v61, 13
-; SI-NEXT:    v_readlane_b32 s37, v61, 15
-; SI-NEXT:    v_readlane_b32 s83, v61, 16
-; SI-NEXT:    v_readlane_b32 s30, v61, 18
-; SI-NEXT:    v_readlane_b32 s82, v61, 17
-; SI-NEXT:    v_readlane_b32 s48, v61, 20
-; SI-NEXT:    v_readlane_b32 s86, v61, 19
-; SI-NEXT:    v_readlane_b32 s96, v61, 22
-; SI-NEXT:    v_readlane_b32 s38, v61, 21
-; SI-NEXT:    v_readlane_b32 s49, v61, 23
-; SI-NEXT:    v_readlane_b32 s99, v61, 24
-; SI-NEXT:    v_readlane_b32 s6, v61, 26
-; SI-NEXT:    v_readlane_b32 s98, v61, 25
+; SI-NEXT:    s_mov_b32 s55, s93
 ; SI-NEXT:    s_mov_b32 s95, s91
-; SI-NEXT:    s_mov_b32 s55, s35
-; SI-NEXT:    s_mov_b32 s80, s60
-; SI-NEXT:    v_readlane_b32 s20, v61, 29
-; SI-NEXT:    s_mov_b32 s91, s61
-; SI-NEXT:    s_mov_b32 s51, s63
-; SI-NEXT:    s_mov_b32 s36, s62
-; SI-NEXT:    v_readlane_b32 s34, v61, 27
-; SI-NEXT:    v_readlane_b32 s53, v61, 28
-; SI-NEXT:    v_readlane_b32 s66, v61, 30
-; SI-NEXT:    v_readlane_b32 s68, v61, 31
-; SI-NEXT:    v_readlane_b32 s69, v61, 32
-; SI-NEXT:    v_readlane_b32 s21, v61, 33
-; SI-NEXT:    v_readlane_b32 s9, v61, 34
-; SI-NEXT:    v_readlane_b32 s13, v61, 35
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr34
-; SI-NEXT:    ; implicit-def: $vgpr37
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    ; implicit-def: $vgpr38
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; implicit-def: $vgpr48
-; SI-NEXT:    ; implicit-def: $vgpr51
-; SI-NEXT:    ; implicit-def: $vgpr50
-; SI-NEXT:    ; implicit-def: $vgpr53
-; SI-NEXT:    ; implicit-def: $vgpr52
-; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    ; implicit-def: $vgpr54
-; SI-NEXT:    ; implicit-def: $vgpr41
-; SI-NEXT:    ; implicit-def: $vgpr40
-; SI-NEXT:    ; implicit-def: $vgpr43
-; SI-NEXT:    ; implicit-def: $vgpr42
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr44
-; SI-NEXT:    ; implicit-def: $vgpr47
-; SI-NEXT:    ; implicit-def: $vgpr46
-; SI-NEXT:    ; implicit-def: $vgpr57
-; SI-NEXT:    ; implicit-def: $vgpr56
-; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr58
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr60
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:  .LBB93_3: ; %Flow
-; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; SI-NEXT:    s_mov_b32 s35, s79
-; SI-NEXT:    s_cbranch_vccnz .LBB93_5
-; SI-NEXT:  ; %bb.4: ; %cmp.true
+; SI-NEXT:    s_mov_b32 s31, s90
+; SI-NEXT:    s_cbranch_execnz .LBB93_3
+; SI-NEXT:  .LBB93_2: ; %cmp.true
 ; SI-NEXT:    s_add_i32 s4, s78, 3
 ; SI-NEXT:    s_and_b32 s4, s4, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s76, 8
@@ -186138,60 +185958,60 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    s_and_b32 s5, s5, 0xff
 ; SI-NEXT:    s_lshl_b32 vcc_lo, s77, 8
 ; SI-NEXT:    s_or_b32 s5, vcc_lo, s5
-; SI-NEXT:    s_add_i32 vcc_lo, s52, 3
+; SI-NEXT:    s_add_i32 vcc_lo, s37, 3
 ; SI-NEXT:    s_and_b32 vcc_lo, vcc_lo, 0xff
 ; SI-NEXT:    s_lshl_b32 vcc_hi, s75, 8
 ; SI-NEXT:    s_or_b32 vcc_lo, vcc_hi, vcc_lo
-; SI-NEXT:    s_add_i32 vcc_hi, s90, 3
+; SI-NEXT:    s_add_i32 vcc_hi, s30, 3
 ; SI-NEXT:    s_and_b32 vcc_hi, vcc_hi, 0xff
 ; SI-NEXT:    s_lshl_b32 s60, s39, 8
 ; SI-NEXT:    s_or_b32 s60, s60, vcc_hi
-; SI-NEXT:    s_add_i32 vcc_hi, s54, 3
+; SI-NEXT:    s_add_i32 vcc_hi, s35, 3
 ; SI-NEXT:    s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT:    s_lshl_b32 s61, s50, 8
+; SI-NEXT:    s_lshl_b32 s61, s48, 8
 ; SI-NEXT:    s_or_b32 s61, s61, vcc_hi
-; SI-NEXT:    s_add_i32 vcc_hi, s65, 3
+; SI-NEXT:    s_add_i32 vcc_hi, s64, 3
 ; SI-NEXT:    s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT:    s_lshl_b32 s62, s64, 8
+; SI-NEXT:    s_lshl_b32 s62, s52, 8
 ; SI-NEXT:    s_or_b32 s62, s62, vcc_hi
-; SI-NEXT:    s_add_i32 vcc_hi, s71, 3
+; SI-NEXT:    s_add_i32 vcc_hi, s67, 3
 ; SI-NEXT:    s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT:    s_lshl_b32 s63, s67, 8
+; SI-NEXT:    s_lshl_b32 s63, s50, 8
 ; SI-NEXT:    s_or_b32 s10, s63, vcc_hi
-; SI-NEXT:    s_add_i32 vcc_hi, s37, 3
+; SI-NEXT:    s_add_i32 vcc_hi, s65, 3
 ; SI-NEXT:    s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT:    s_lshl_b32 s72, s70, 8
+; SI-NEXT:    s_lshl_b32 s72, s54, 8
 ; SI-NEXT:    s_or_b32 s72, s72, vcc_hi
-; SI-NEXT:    s_add_i32 vcc_hi, s82, 3
+; SI-NEXT:    s_add_i32 vcc_hi, s71, 3
 ; SI-NEXT:    s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT:    s_lshl_b32 s73, s83, 8
+; SI-NEXT:    s_lshl_b32 s73, s70, 8
 ; SI-NEXT:    s_or_b32 s73, s73, vcc_hi
-; SI-NEXT:    s_add_i32 vcc_hi, s86, 3
+; SI-NEXT:    s_add_i32 vcc_hi, s83, 3
 ; SI-NEXT:    s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT:    s_lshl_b32 s74, s30, 8
+; SI-NEXT:    s_lshl_b32 s74, s49, 8
 ; SI-NEXT:    s_or_b32 s74, s74, vcc_hi
-; SI-NEXT:    s_add_i32 vcc_hi, s38, 3
+; SI-NEXT:    s_add_i32 vcc_hi, s82, 3
 ; SI-NEXT:    s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT:    s_lshl_b32 s75, s48, 8
+; SI-NEXT:    s_lshl_b32 s75, s80, 8
 ; SI-NEXT:    s_or_b32 s75, s75, vcc_hi
-; SI-NEXT:    s_add_i32 vcc_hi, s49, 3
+; SI-NEXT:    s_add_i32 vcc_hi, s87, 3
 ; SI-NEXT:    s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT:    s_lshl_b32 s76, s96, 8
+; SI-NEXT:    s_lshl_b32 s76, s84, 8
 ; SI-NEXT:    s_or_b32 s76, s76, vcc_hi
-; SI-NEXT:    s_add_i32 vcc_hi, s98, 3
+; SI-NEXT:    s_add_i32 vcc_hi, s51, 3
 ; SI-NEXT:    s_add_i32 s93, s53, 3
 ; SI-NEXT:    s_and_b32 vcc_hi, vcc_hi, 0xff
-; SI-NEXT:    s_lshl_b32 s77, s99, 8
+; SI-NEXT:    s_lshl_b32 s77, s86, 8
 ; SI-NEXT:    s_add_i32 s89, s34, 3
 ; SI-NEXT:    s_and_b32 s93, s93, 0xff
-; SI-NEXT:    s_lshl_b32 s78, s16, 8
+; SI-NEXT:    s_lshl_b32 s78, s94, 8
 ; SI-NEXT:    s_add_i32 s34, s66, 3
 ; SI-NEXT:    s_or_b32 s77, s77, vcc_hi
 ; SI-NEXT:    s_and_b32 s89, s89, 0xff
-; SI-NEXT:    s_lshl_b32 vcc_hi, s6, 8
+; SI-NEXT:    s_lshl_b32 vcc_hi, s96, 8
 ; SI-NEXT:    s_or_b32 s22, s78, s93
 ; SI-NEXT:    s_and_b32 s93, s34, 0xff
-; SI-NEXT:    s_lshl_b32 s92, s20, 8
+; SI-NEXT:    s_lshl_b32 s92, s16, 8
 ; SI-NEXT:    s_add_i32 s53, s68, 3
 ; SI-NEXT:    s_or_b32 s89, vcc_hi, s89
 ; SI-NEXT:    s_or_b32 s92, s92, s93
@@ -186200,251 +186020,261 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    s_add_i32 s66, s69, 3
 ; SI-NEXT:    s_or_b32 s93, vcc_hi, s93
 ; SI-NEXT:    s_and_b32 vcc_hi, s66, 0xff
-; SI-NEXT:    s_lshl_b32 s34, s44, 8
-; SI-NEXT:    s_add_i32 s68, s45, 3
+; SI-NEXT:    s_lshl_b32 s34, s45, 8
+; SI-NEXT:    s_add_i32 s68, s6, 3
 ; SI-NEXT:    s_or_b32 vcc_hi, s34, vcc_hi
 ; SI-NEXT:    s_and_b32 s34, s68, 0xff
 ; SI-NEXT:    s_lshl_b32 s39, s40, 8
 ; SI-NEXT:    s_add_i32 s69, s81, 3
 ; SI-NEXT:    s_or_b32 s34, s39, s34
 ; SI-NEXT:    s_and_b32 s39, s69, 0xff
-; SI-NEXT:    s_lshl_b32 s52, s85, 8
+; SI-NEXT:    s_lshl_b32 s52, s21, 8
 ; SI-NEXT:    s_add_i32 s81, s7, 3
 ; SI-NEXT:    s_or_b32 s39, s52, s39
 ; SI-NEXT:    s_and_b32 s52, s81, 0xff
-; SI-NEXT:    s_lshl_b32 s53, s11, 8
+; SI-NEXT:    s_lshl_b32 s53, s97, 8
 ; SI-NEXT:    s_add_i32 s85, s12, 3
 ; SI-NEXT:    s_or_b32 s52, s53, s52
 ; SI-NEXT:    s_and_b32 s53, s85, 0xff
-; SI-NEXT:    s_lshl_b32 s64, s21, 8
-; SI-NEXT:    s_add_i32 s97, s14, 3
+; SI-NEXT:    s_lshl_b32 s64, s11, 8
+; SI-NEXT:    s_add_i32 s97, s56, 3
 ; SI-NEXT:    s_or_b32 s53, s64, s53
 ; SI-NEXT:    s_and_b32 s64, s97, 0xff
 ; SI-NEXT:    s_lshl_b32 s66, s46, 8
-; SI-NEXT:    s_add_i32 s21, s57, 3
+; SI-NEXT:    s_add_i32 s21, s29, 3
 ; SI-NEXT:    s_or_b32 s64, s66, s64
 ; SI-NEXT:    s_and_b32 s21, s21, 0xff
-; SI-NEXT:    s_lshl_b32 s66, s25, 8
+; SI-NEXT:    s_lshl_b32 s66, s59, 8
 ; SI-NEXT:    s_add_i32 s25, s8, 3
 ; SI-NEXT:    s_or_b32 s66, s66, s21
 ; SI-NEXT:    s_and_b32 s21, s25, 0xff
-; SI-NEXT:    s_lshl_b32 s6, s27, 8
-; SI-NEXT:    s_add_i32 s29, s29, 3
-; SI-NEXT:    v_readlane_b32 s16, v62, 14
+; SI-NEXT:    s_lshl_b32 s6, s28, 8
+; SI-NEXT:    s_add_i32 s29, s19, 3
 ; SI-NEXT:    s_or_b32 s67, s6, s21
 ; SI-NEXT:    s_and_b32 s6, s29, 0xff
-; SI-NEXT:    s_lshl_b32 s18, s24, 8
-; SI-NEXT:    s_add_i32 s28, s9, 3
-; SI-NEXT:    s_add_i32 s27, s16, 3
-; SI-NEXT:    v_readlane_b32 s16, v62, 12
+; SI-NEXT:    s_lshl_b32 s18, s26, 8
+; SI-NEXT:    s_add_i32 s28, s17, 3
 ; SI-NEXT:    s_or_b32 s68, s18, s6
 ; SI-NEXT:    s_and_b32 s6, s28, 0xff
-; SI-NEXT:    s_lshl_b32 s18, s88, 8
-; SI-NEXT:    s_add_i32 s7, s23, 3
-; SI-NEXT:    s_lshl_b32 s23, s16, 8
-; SI-NEXT:    v_readlane_b32 s16, v62, 13
+; SI-NEXT:    s_lshl_b32 s18, s23, 8
 ; SI-NEXT:    s_or_b32 s69, s18, s6
+; SI-NEXT:    v_readlane_b32 s6, v62, 17
+; SI-NEXT:    s_add_i32 s7, s6, 3
+; SI-NEXT:    v_readlane_b32 s16, v62, 15
 ; SI-NEXT:    s_and_b32 s6, s7, 0xff
-; SI-NEXT:    s_lshl_b32 s7, s26, 8
-; SI-NEXT:    s_add_i32 s11, s13, 3
-; SI-NEXT:    s_add_i32 s24, s16, 3
-; SI-NEXT:    v_readlane_b32 s16, v62, 10
+; SI-NEXT:    v_readlane_b32 s7, v62, 16
+; SI-NEXT:    s_add_i32 s27, s16, 3
+; SI-NEXT:    v_readlane_b32 s16, v62, 13
+; SI-NEXT:    s_lshl_b32 s7, s7, 8
+; SI-NEXT:    s_lshl_b32 s23, s16, 8
+; SI-NEXT:    v_readlane_b32 s16, v62, 14
+; SI-NEXT:    s_mov_b32 s91, s24
 ; SI-NEXT:    s_or_b32 s70, s7, s6
-; SI-NEXT:    s_and_b32 s6, s11, 0xff
-; SI-NEXT:    s_lshl_b32 s7, s19, 8
-; SI-NEXT:    s_lshl_b32 s19, s16, 8
+; SI-NEXT:    v_readlane_b32 s6, v62, 20
+; SI-NEXT:    s_add_i32 s24, s16, 3
 ; SI-NEXT:    v_readlane_b32 s16, v62, 11
-; SI-NEXT:    s_or_b32 s71, s7, s6
-; SI-NEXT:    v_readlane_b32 s6, v62, 16
+; SI-NEXT:    s_add_i32 s11, s6, 3
+; SI-NEXT:    v_readlane_b32 s7, v62, 18
+; SI-NEXT:    s_lshl_b32 s19, s16, 8
+; SI-NEXT:    v_readlane_b32 s16, v62, 12
+; SI-NEXT:    s_mov_b32 s90, s20
+; SI-NEXT:    s_and_b32 s6, s11, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 8
 ; SI-NEXT:    s_add_i32 s20, s16, 3
-; SI-NEXT:    v_readlane_b32 s16, v62, 8
-; SI-NEXT:    s_add_i32 s12, s6, 3
-; SI-NEXT:    s_lshl_b32 s7, s17, 8
+; SI-NEXT:    v_readlane_b32 s16, v62, 9
+; SI-NEXT:    s_or_b32 s71, s7, s6
+; SI-NEXT:    v_readlane_b32 s6, v62, 22
 ; SI-NEXT:    s_and_b32 s20, s20, 0xff
 ; SI-NEXT:    s_lshl_b32 s17, s16, 8
-; SI-NEXT:    v_readlane_b32 s16, v62, 9
-; SI-NEXT:    s_and_b32 s6, s12, 0xff
+; SI-NEXT:    v_readlane_b32 s16, v62, 10
+; SI-NEXT:    s_add_i32 s12, s6, 3
+; SI-NEXT:    v_readlane_b32 s7, v62, 19
 ; SI-NEXT:    s_or_b32 s17, s17, s20
 ; SI-NEXT:    s_add_i32 s16, s16, 3
-; SI-NEXT:    v_readlane_b32 s20, v62, 7
-; SI-NEXT:    s_or_b32 s81, s7, s6
-; SI-NEXT:    v_readlane_b32 s6, v62, 17
+; SI-NEXT:    v_readlane_b32 s20, v62, 8
+; SI-NEXT:    s_and_b32 s6, s12, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 8
 ; SI-NEXT:    s_and_b32 s16, s16, 0xff
 ; SI-NEXT:    s_lshl_b32 s20, s20, 8
-; SI-NEXT:    s_add_i32 s14, s6, 3
-; SI-NEXT:    v_readlane_b32 s7, v62, 15
+; SI-NEXT:    s_or_b32 s81, s7, s6
+; SI-NEXT:    v_readlane_b32 s6, v62, 23
 ; SI-NEXT:    s_and_b32 s24, s24, 0xff
 ; SI-NEXT:    s_or_b32 s16, s20, s16
-; SI-NEXT:    v_readlane_b32 s20, v62, 6
-; SI-NEXT:    s_and_b32 s6, s14, 0xff
-; SI-NEXT:    s_lshl_b32 s7, s7, 8
+; SI-NEXT:    v_readlane_b32 s20, v62, 7
+; SI-NEXT:    s_add_i32 s14, s6, 3
+; SI-NEXT:    v_readlane_b32 s7, v62, 21
 ; SI-NEXT:    s_or_b32 s19, s19, s24
 ; SI-NEXT:    s_add_i32 s98, s20, 3
-; SI-NEXT:    v_readlane_b32 s24, v62, 5
-; SI-NEXT:    s_or_b32 s83, s7, s6
-; SI-NEXT:    v_readlane_b32 s6, v62, 20
+; SI-NEXT:    v_readlane_b32 s24, v62, 6
+; SI-NEXT:    s_and_b32 s6, s14, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 8
 ; SI-NEXT:    s_and_b32 s20, s98, 0xff
 ; SI-NEXT:    s_lshl_b32 s24, s24, 8
-; SI-NEXT:    s_add_i32 s41, s6, 3
-; SI-NEXT:    v_readlane_b32 s7, v62, 18
+; SI-NEXT:    s_or_b32 s83, s7, s6
+; SI-NEXT:    v_readlane_b32 s6, v62, 26
 ; SI-NEXT:    s_and_b32 s27, s27, 0xff
 ; SI-NEXT:    s_or_b32 s20, s24, s20
-; SI-NEXT:    v_readlane_b32 s24, v62, 4
-; SI-NEXT:    s_and_b32 s6, s41, 0xff
-; SI-NEXT:    s_lshl_b32 s7, s7, 8
+; SI-NEXT:    v_readlane_b32 s24, v62, 5
+; SI-NEXT:    s_add_i32 s41, s6, 3
+; SI-NEXT:    v_readlane_b32 s7, v62, 24
 ; SI-NEXT:    s_or_b32 s23, s23, s27
 ; SI-NEXT:    s_add_i32 s86, s24, 3
-; SI-NEXT:    v_readlane_b32 s27, v62, 3
-; SI-NEXT:    s_or_b32 s85, s7, s6
-; SI-NEXT:    v_readlane_b32 s6, v62, 22
+; SI-NEXT:    v_readlane_b32 s27, v62, 4
+; SI-NEXT:    s_and_b32 s6, s41, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 8
 ; SI-NEXT:    s_and_b32 s24, s86, 0xff
 ; SI-NEXT:    s_lshl_b32 s27, s27, 8
-; SI-NEXT:    s_add_i32 s46, s6, 3
-; SI-NEXT:    v_readlane_b32 s7, v62, 19
+; SI-NEXT:    s_or_b32 s85, s7, s6
+; SI-NEXT:    v_readlane_b32 s6, v62, 28
 ; SI-NEXT:    s_or_b32 s24, s27, s24
-; SI-NEXT:    v_readlane_b32 s27, v62, 2
-; SI-NEXT:    s_and_b32 s6, s46, 0xff
-; SI-NEXT:    s_lshl_b32 s7, s7, 8
+; SI-NEXT:    v_readlane_b32 s27, v62, 3
+; SI-NEXT:    s_add_i32 s46, s6, 3
+; SI-NEXT:    v_readlane_b32 s7, v62, 25
 ; SI-NEXT:    s_add_i32 s12, s73, 0x300
 ; SI-NEXT:    s_add_i32 s82, s27, 3
-; SI-NEXT:    v_readlane_b32 s73, v62, 1
-; SI-NEXT:    s_or_b32 s96, s7, s6
-; SI-NEXT:    v_readlane_b32 s6, v62, 25
+; SI-NEXT:    v_readlane_b32 s73, v62, 2
+; SI-NEXT:    s_and_b32 s6, s46, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 8
 ; SI-NEXT:    s_and_b32 s27, s82, 0xff
 ; SI-NEXT:    s_lshl_b32 s73, s73, 8
-; SI-NEXT:    s_add_i32 s47, s6, 3
-; SI-NEXT:    v_readlane_b32 s7, v62, 21
+; SI-NEXT:    s_or_b32 s96, s7, s6
+; SI-NEXT:    v_readlane_b32 s6, v62, 31
 ; SI-NEXT:    s_or_b32 s27, s73, s27
-; SI-NEXT:    v_readlane_b32 s73, v62, 0
-; SI-NEXT:    s_and_b32 s6, s47, 0xff
-; SI-NEXT:    s_lshl_b32 s7, s7, 8
+; SI-NEXT:    v_readlane_b32 s73, v62, 1
+; SI-NEXT:    s_add_i32 s47, s6, 3
+; SI-NEXT:    v_readlane_b32 s7, v62, 27
 ; SI-NEXT:    s_add_i32 s13, s74, 0x300
 ; SI-NEXT:    s_add_i32 s65, s73, 3
-; SI-NEXT:    v_readlane_b32 s74, v61, 63
-; SI-NEXT:    s_or_b32 s97, s7, s6
-; SI-NEXT:    v_readlane_b32 s6, v62, 28
+; SI-NEXT:    v_readlane_b32 s74, v62, 0
+; SI-NEXT:    s_and_b32 s6, s47, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 8
 ; SI-NEXT:    s_and_b32 s73, s65, 0xff
 ; SI-NEXT:    s_lshl_b32 s74, s74, 8
-; SI-NEXT:    s_add_i32 s56, s6, 3
-; SI-NEXT:    v_readlane_b32 s7, v62, 23
+; SI-NEXT:    s_or_b32 s97, s7, s6
+; SI-NEXT:    v_readlane_b32 s6, v62, 34
 ; SI-NEXT:    s_or_b32 s73, s74, s73
-; SI-NEXT:    v_readlane_b32 s74, v61, 62
-; SI-NEXT:    s_and_b32 s6, s56, 0xff
-; SI-NEXT:    s_lshl_b32 s7, s7, 8
+; SI-NEXT:    v_readlane_b32 s74, v61, 63
+; SI-NEXT:    s_add_i32 s56, s6, 3
+; SI-NEXT:    v_readlane_b32 s7, v62, 29
 ; SI-NEXT:    s_add_i32 s14, s75, 0x300
 ; SI-NEXT:    s_add_i32 s54, s74, 3
-; SI-NEXT:    v_readlane_b32 s75, v61, 61
-; SI-NEXT:    s_or_b32 s63, s7, s6
-; SI-NEXT:    v_readlane_b32 s6, v62, 30
+; SI-NEXT:    v_readlane_b32 s75, v61, 62
+; SI-NEXT:    s_and_b32 s6, s56, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 8
 ; SI-NEXT:    s_and_b32 s74, s54, 0xff
 ; SI-NEXT:    s_lshl_b32 s75, s75, 8
-; SI-NEXT:    s_add_i32 s58, s6, 3
-; SI-NEXT:    v_readlane_b32 s7, v62, 24
+; SI-NEXT:    s_or_b32 s63, s7, s6
+; SI-NEXT:    v_readlane_b32 s6, v62, 36
 ; SI-NEXT:    s_or_b32 s74, s75, s74
-; SI-NEXT:    v_readlane_b32 s75, v61, 60
-; SI-NEXT:    s_and_b32 s6, s58, 0xff
-; SI-NEXT:    s_lshl_b32 s7, s7, 8
+; SI-NEXT:    v_readlane_b32 s75, v61, 61
+; SI-NEXT:    s_add_i32 s58, s6, 3
+; SI-NEXT:    v_readlane_b32 s7, v62, 30
 ; SI-NEXT:    s_add_i32 s15, s76, 0x300
 ; SI-NEXT:    s_add_i32 s50, s75, 3
-; SI-NEXT:    v_readlane_b32 s76, v61, 59
-; SI-NEXT:    s_or_b32 s79, s7, s6
-; SI-NEXT:    v_readlane_b32 s6, v62, 31
+; SI-NEXT:    v_readlane_b32 s76, v61, 60
+; SI-NEXT:    s_and_b32 s6, s58, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 8
 ; SI-NEXT:    s_and_b32 s75, s50, 0xff
 ; SI-NEXT:    s_lshl_b32 s76, s76, 8
-; SI-NEXT:    s_add_i32 s59, s6, 3
-; SI-NEXT:    v_readlane_b32 s7, v62, 27
+; SI-NEXT:    s_or_b32 s79, s7, s6
+; SI-NEXT:    v_readlane_b32 s6, v62, 37
 ; SI-NEXT:    s_or_b32 s75, s76, s75
-; SI-NEXT:    v_readlane_b32 s76, v61, 58
-; SI-NEXT:    s_and_b32 s6, s59, 0xff
-; SI-NEXT:    s_lshl_b32 s7, s7, 8
+; SI-NEXT:    v_readlane_b32 s76, v61, 59
+; SI-NEXT:    s_add_i32 s59, s6, 3
+; SI-NEXT:    v_readlane_b32 s7, v62, 33
 ; SI-NEXT:    s_add_i32 s18, s77, 0x300
 ; SI-NEXT:    s_add_i32 s48, s76, 3
-; SI-NEXT:    v_readlane_b32 s77, v61, 57
-; SI-NEXT:    s_or_b32 s78, s7, s6
-; SI-NEXT:    v_readlane_b32 s6, v62, 29
+; SI-NEXT:    v_readlane_b32 s77, v61, 58
+; SI-NEXT:    s_and_b32 s6, s59, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 8
 ; SI-NEXT:    s_and_b32 s76, s48, 0xff
 ; SI-NEXT:    s_lshl_b32 s77, s77, 8
-; SI-NEXT:    s_add_i32 s57, s6, 3
-; SI-NEXT:    v_readlane_b32 s7, v62, 26
+; SI-NEXT:    s_or_b32 s78, s7, s6
+; SI-NEXT:    v_readlane_b32 s6, v62, 35
 ; SI-NEXT:    s_or_b32 s76, s77, s76
-; SI-NEXT:    v_readlane_b32 s77, v61, 56
-; SI-NEXT:    s_and_b32 s6, s57, 0xff
-; SI-NEXT:    s_lshl_b32 s7, s7, 8
+; SI-NEXT:    v_readlane_b32 s77, v61, 57
+; SI-NEXT:    s_add_i32 s57, s6, 3
+; SI-NEXT:    v_readlane_b32 s7, v62, 32
 ; SI-NEXT:    s_add_i32 s11, s72, 0x300
 ; SI-NEXT:    s_add_i32 s72, s79, 0x300
 ; SI-NEXT:    s_add_i32 s37, s77, 3
-; SI-NEXT:    v_readlane_b32 s79, v61, 55
-; SI-NEXT:    s_or_b32 s88, s7, s6
+; SI-NEXT:    v_readlane_b32 s79, v61, 56
+; SI-NEXT:    s_and_b32 s6, s57, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 8
 ; SI-NEXT:    s_and_b32 s77, s37, 0xff
 ; SI-NEXT:    s_lshl_b32 s79, s79, 8
-; SI-NEXT:    s_add_i32 s35, s35, 3
+; SI-NEXT:    s_or_b32 s88, s7, s6
+; SI-NEXT:    s_or_b32 s77, s79, s77
+; SI-NEXT:    v_readlane_b32 s79, v61, 55
 ; SI-NEXT:    s_add_i32 s21, s89, 0x300
 ; SI-NEXT:    s_add_i32 s89, s88, 0x300
-; SI-NEXT:    s_or_b32 s77, s79, s77
+; SI-NEXT:    s_add_i32 s35, s79, 3
+; SI-NEXT:    v_readlane_b32 s88, v61, 54
 ; SI-NEXT:    s_and_b32 s79, s35, 0xff
-; SI-NEXT:    s_lshl_b32 s88, s36, 8
+; SI-NEXT:    s_lshl_b32 s88, s88, 8
 ; SI-NEXT:    s_or_b32 s79, s88, s79
-; SI-NEXT:    v_readlane_b32 s88, v61, 54
-; SI-NEXT:    s_add_i32 s30, s88, 3
+; SI-NEXT:    v_readlane_b32 s88, v61, 53
 ; SI-NEXT:    s_add_i32 s25, s92, 0x300
+; SI-NEXT:    s_add_i32 s30, s88, 3
+; SI-NEXT:    v_readlane_b32 s92, v61, 52
 ; SI-NEXT:    s_and_b32 s88, s30, 0xff
-; SI-NEXT:    s_lshl_b32 s92, s51, 8
-; SI-NEXT:    s_add_i32 s94, s94, 3
-; SI-NEXT:    v_readlane_b32 s90, v61, 53
+; SI-NEXT:    s_lshl_b32 s92, s92, 8
 ; SI-NEXT:    s_or_b32 s88, s92, s88
+; SI-NEXT:    v_readlane_b32 s92, v61, 51
+; SI-NEXT:    s_add_i32 s94, s92, 3
 ; SI-NEXT:    s_and_b32 s92, s94, 0xff
 ; SI-NEXT:    s_lshl_b32 s91, s91, 8
 ; SI-NEXT:    s_add_i32 s90, s90, 3
 ; SI-NEXT:    s_or_b32 s91, s91, s92
 ; SI-NEXT:    s_and_b32 s90, s90, 0xff
-; SI-NEXT:    s_lshl_b32 s92, s80, 8
+; SI-NEXT:    s_lshl_b32 s92, s31, 8
 ; SI-NEXT:    s_or_b32 s90, s92, s90
-; SI-NEXT:    v_readlane_b32 s92, v61, 52
+; SI-NEXT:    v_readlane_b32 s92, v61, 50
 ; SI-NEXT:    s_add_i32 s92, s92, 3
 ; SI-NEXT:    s_add_i32 s26, s93, 0x300
 ; SI-NEXT:    s_and_b32 s92, s92, 0xff
-; SI-NEXT:    s_lshl_b32 s93, s55, 8
+; SI-NEXT:    s_lshl_b32 s93, s95, 8
 ; SI-NEXT:    s_or_b32 s92, s93, s92
-; SI-NEXT:    v_readlane_b32 s93, v61, 51
+; SI-NEXT:    v_readlane_b32 s93, v61, 49
 ; SI-NEXT:    s_add_i32 s93, s93, 3
 ; SI-NEXT:    s_and_b32 s93, s93, 0xff
-; SI-NEXT:    s_lshl_b32 s94, s95, 8
+; SI-NEXT:    s_lshl_b32 s94, s55, 8
 ; SI-NEXT:    s_or_b32 s93, s94, s93
-; SI-NEXT:    v_readlane_b32 s94, v61, 49
+; SI-NEXT:    v_readlane_b32 s94, v61, 48
 ; SI-NEXT:    s_add_i32 s94, s94, 3
 ; SI-NEXT:    s_and_b32 s94, s94, 0xff
-; SI-NEXT:    s_lshl_b32 s95, s31, 8
+; SI-NEXT:    s_lshl_b32 s95, s99, 8
 ; SI-NEXT:    s_or_b32 s94, s95, s94
-; SI-NEXT:    v_readlane_b32 s95, v61, 50
+; SI-NEXT:    v_readlane_b32 s95, v61, 1
 ; SI-NEXT:    s_add_i32 s95, s95, 3
-; SI-NEXT:    v_readlane_b32 s30, v61, 48
+; SI-NEXT:    v_readlane_b32 s30, v61, 0
 ; SI-NEXT:    s_add_i32 s6, vcc_lo, 0x300
 ; SI-NEXT:    s_and_b32 s95, s95, 0xff
 ; SI-NEXT:    s_lshl_b32 vcc_lo, s30, 8
 ; SI-NEXT:    v_readlane_b32 s30, v61, 47
 ; SI-NEXT:    s_or_b32 s95, vcc_lo, s95
 ; SI-NEXT:    s_add_i32 vcc_lo, s30, 3
-; SI-NEXT:    v_readlane_b32 s30, v61, 46
+; SI-NEXT:    v_readlane_b32 s30, v61, 2
 ; SI-NEXT:    s_add_i32 s28, vcc_hi, 0x300
 ; SI-NEXT:    s_and_b32 vcc_lo, vcc_lo, 0xff
 ; SI-NEXT:    s_lshl_b32 vcc_hi, s30, 8
-; SI-NEXT:    v_readlane_b32 s30, v61, 45
+; SI-NEXT:    v_readlane_b32 s30, v61, 46
 ; SI-NEXT:    s_or_b32 vcc_lo, vcc_hi, vcc_lo
 ; SI-NEXT:    s_add_i32 vcc_hi, s30, 3
-; SI-NEXT:    v_readlane_b32 s30, v61, 44
+; SI-NEXT:    v_readlane_b32 s30, v61, 45
 ; SI-NEXT:    s_and_b32 vcc_hi, vcc_hi, 0xff
 ; SI-NEXT:    s_lshl_b32 s30, s30, 8
 ; SI-NEXT:    s_or_b32 vcc_hi, s30, vcc_hi
-; SI-NEXT:    v_readlane_b32 s30, v61, 43
+; SI-NEXT:    v_readlane_b32 s30, v61, 44
 ; SI-NEXT:    s_add_i32 s30, s30, 3
-; SI-NEXT:    v_readlane_b32 s31, v61, 42
+; SI-NEXT:    v_readlane_b32 s31, v61, 43
 ; SI-NEXT:    s_and_b32 s30, s30, 0xff
 ; SI-NEXT:    s_lshl_b32 s31, s31, 8
 ; SI-NEXT:    s_or_b32 s30, s31, s30
-; SI-NEXT:    v_readlane_b32 s31, v61, 41
+; SI-NEXT:    v_readlane_b32 s31, v61, 42
 ; SI-NEXT:    s_add_i32 s29, s34, 0x300
 ; SI-NEXT:    s_add_i32 s31, s31, 3
-; SI-NEXT:    v_readlane_b32 s34, v61, 40
+; SI-NEXT:    v_readlane_b32 s34, v61, 41
 ; SI-NEXT:    s_and_b32 s31, s31, 0xff
 ; SI-NEXT:    s_lshl_b32 s34, s34, 8
 ; SI-NEXT:    s_or_b32 s31, s34, s31
@@ -186452,25 +186282,25 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s31
 ; SI-NEXT:    s_addk_i32 s30, 0x300
 ; SI-NEXT:    s_addk_i32 vcc_hi, 0x300
-; SI-NEXT:    v_readlane_b32 s34, v61, 39
+; SI-NEXT:    v_readlane_b32 s34, v61, 40
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s30
 ; SI-NEXT:    s_add_i32 s34, s34, 3
-; SI-NEXT:    v_readlane_b32 s35, v61, 38
+; SI-NEXT:    v_readlane_b32 s35, v61, 39
 ; SI-NEXT:    s_and_b32 s34, s34, 0xff
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, vcc_hi
 ; SI-NEXT:    s_lshl_b32 s35, s35, 8
-; SI-NEXT:    s_addk_i32 s95, 0x300
+; SI-NEXT:    s_addk_i32 vcc_lo, 0x300
 ; SI-NEXT:    s_or_b32 s34, s35, s34
-; SI-NEXT:    v_readlane_b32 s35, v61, 37
+; SI-NEXT:    v_readlane_b32 s35, v61, 38
 ; SI-NEXT:    s_add_i32 s35, s35, 3
-; SI-NEXT:    v_readlane_b32 s36, v61, 36
+; SI-NEXT:    v_readlane_b32 s36, v61, 37
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, s95
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, vcc_lo
 ; SI-NEXT:    s_and_b32 s35, s35, 0xff
 ; SI-NEXT:    s_lshl_b32 s36, s36, 8
 ; SI-NEXT:    s_or_b32 s35, s36, s35
@@ -186517,19 +186347,19 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    s_addk_i32 s92, 0x300
 ; SI-NEXT:    s_addk_i32 s93, 0x300
 ; SI-NEXT:    s_addk_i32 s94, 0x300
-; SI-NEXT:    s_addk_i32 vcc_lo, 0x300
+; SI-NEXT:    s_addk_i32 s95, 0x300
 ; SI-NEXT:    s_addk_i32 s34, 0x300
 ; SI-NEXT:    s_addk_i32 s35, 0x300
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, s35
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, s34
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, vcc_lo
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, s95
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, s94
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, s93
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, s92
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, s92
 ; SI-NEXT:    v_cvt_f32_f16_e32 v9, s90
 ; SI-NEXT:    v_cvt_f32_f16_e32 v13, s91
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, s88
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, s88
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, s79
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, s77
 ; SI-NEXT:    v_cvt_f32_f16_e32 v17, s76
@@ -186582,7 +186412,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, s6
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
-; SI-NEXT:  .LBB93_5: ; %end
+; SI-NEXT:  .LBB93_3: ; %end
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -186637,26 +186467,26 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, 4, v0
 ; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 12, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, 8, v0
 ; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v7
+; SI-NEXT:    v_add_i32_e32 v7, vcc, 12, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
 ; SI-NEXT:    buffer_store_dword v5, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v8
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 16, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
@@ -186671,7 +186501,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v5, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v12
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 24, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
@@ -186869,6 +186699,134 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
+; SI-NEXT:  .LBB93_4:
+; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    s_mov_b32 s17, s19
+; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    s_mov_b32 s19, s50
+; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    s_mov_b32 s23, s48
+; SI-NEXT:    s_mov_b32 s26, s37
+; SI-NEXT:    s_mov_b32 s28, s29
+; SI-NEXT:    s_mov_b32 s29, s76
+; SI-NEXT:    s_mov_b32 s59, s58
+; SI-NEXT:    s_mov_b32 s56, s47
+; SI-NEXT:    s_mov_b32 s46, s41
+; SI-NEXT:    s_mov_b32 s12, s11
+; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_mov_b32 s7, s97
+; SI-NEXT:    s_mov_b32 s97, s81
+; SI-NEXT:    s_mov_b32 s81, s85
+; SI-NEXT:    s_mov_b32 s6, s40
+; SI-NEXT:    s_mov_b32 s40, s72
+; SI-NEXT:    s_mov_b32 s45, s73
+; SI-NEXT:    s_mov_b32 s15, s89
+; SI-NEXT:    s_mov_b32 s24, s98
+; SI-NEXT:    s_mov_b32 s20, s88
+; SI-NEXT:    s_mov_b32 s99, s55
+; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    v_readlane_b32 s75, v61, 7
+; SI-NEXT:    v_readlane_b32 s76, v61, 3
+; SI-NEXT:    v_readlane_b32 s77, v61, 5
+; SI-NEXT:    v_readlane_b32 s78, v61, 4
+; SI-NEXT:    v_readlane_b32 s92, v61, 6
+; SI-NEXT:    v_readlane_b32 s39, v61, 9
+; SI-NEXT:    v_readlane_b32 s37, v61, 8
+; SI-NEXT:    v_readlane_b32 s30, v61, 10
+; SI-NEXT:    v_readlane_b32 s48, v61, 11
+; SI-NEXT:    v_readlane_b32 s52, v61, 13
+; SI-NEXT:    v_readlane_b32 s35, v61, 12
+; SI-NEXT:    v_readlane_b32 s50, v61, 15
+; SI-NEXT:    v_readlane_b32 s64, v61, 14
+; SI-NEXT:    v_readlane_b32 s54, v61, 17
+; SI-NEXT:    v_readlane_b32 s67, v61, 16
+; SI-NEXT:    v_readlane_b32 s65, v61, 18
+; SI-NEXT:    v_readlane_b32 s70, v61, 19
+; SI-NEXT:    v_readlane_b32 s49, v61, 21
+; SI-NEXT:    v_readlane_b32 s71, v61, 20
+; SI-NEXT:    v_readlane_b32 s80, v61, 23
+; SI-NEXT:    v_readlane_b32 s83, v61, 22
+; SI-NEXT:    v_readlane_b32 s84, v61, 25
+; SI-NEXT:    v_readlane_b32 s82, v61, 24
+; SI-NEXT:    v_readlane_b32 s87, v61, 26
+; SI-NEXT:    v_readlane_b32 s86, v61, 27
+; SI-NEXT:    v_readlane_b32 s96, v61, 29
+; SI-NEXT:    v_readlane_b32 s51, v61, 28
+; SI-NEXT:    s_mov_b32 s55, s93
+; SI-NEXT:    s_mov_b32 s95, s91
+; SI-NEXT:    v_readlane_b32 s94, v61, 31
+; SI-NEXT:    s_mov_b32 s31, s90
+; SI-NEXT:    v_readlane_b32 s34, v61, 30
+; SI-NEXT:    v_readlane_b32 s53, v61, 32
+; SI-NEXT:    v_readlane_b32 s66, v61, 33
+; SI-NEXT:    v_readlane_b32 s68, v61, 34
+; SI-NEXT:    v_readlane_b32 s69, v61, 35
+; SI-NEXT:    v_readlane_b32 s8, v61, 36
+; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $vgpr10
+; SI-NEXT:    ; implicit-def: $vgpr8
+; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $vgpr12
+; SI-NEXT:    ; implicit-def: $vgpr15
+; SI-NEXT:    ; implicit-def: $vgpr14
+; SI-NEXT:    ; implicit-def: $vgpr17
+; SI-NEXT:    ; implicit-def: $vgpr16
+; SI-NEXT:    ; implicit-def: $vgpr19
+; SI-NEXT:    ; implicit-def: $vgpr18
+; SI-NEXT:    ; implicit-def: $vgpr21
+; SI-NEXT:    ; implicit-def: $vgpr20
+; SI-NEXT:    ; implicit-def: $vgpr23
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr25
+; SI-NEXT:    ; implicit-def: $vgpr24
+; SI-NEXT:    ; implicit-def: $vgpr27
+; SI-NEXT:    ; implicit-def: $vgpr26
+; SI-NEXT:    ; implicit-def: $vgpr29
+; SI-NEXT:    ; implicit-def: $vgpr28
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr30
+; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr35
+; SI-NEXT:    ; implicit-def: $vgpr34
+; SI-NEXT:    ; implicit-def: $vgpr37
+; SI-NEXT:    ; implicit-def: $vgpr36
+; SI-NEXT:    ; implicit-def: $vgpr39
+; SI-NEXT:    ; implicit-def: $vgpr38
+; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    ; implicit-def: $vgpr50
+; SI-NEXT:    ; implicit-def: $vgpr53
+; SI-NEXT:    ; implicit-def: $vgpr52
+; SI-NEXT:    ; implicit-def: $vgpr55
+; SI-NEXT:    ; implicit-def: $vgpr54
+; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr40
+; SI-NEXT:    ; implicit-def: $vgpr43
+; SI-NEXT:    ; implicit-def: $vgpr42
+; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; implicit-def: $vgpr44
+; SI-NEXT:    ; implicit-def: $vgpr47
+; SI-NEXT:    ; implicit-def: $vgpr46
+; SI-NEXT:    ; implicit-def: $vgpr57
+; SI-NEXT:    ; implicit-def: $vgpr56
+; SI-NEXT:    ; implicit-def: $vgpr59
+; SI-NEXT:    ; implicit-def: $vgpr58
+; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr60
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    s_branch .LBB93_2
 ;
 ; VI-LABEL: bitcast_v128i8_to_v64f16_scalar:
 ; VI:       ; %bb.0:
@@ -186930,14 +186888,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:168
 ; VI-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:176
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v3
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v5
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v7
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v40, 8, v27
 ; VI-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
 ; VI-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
 ; VI-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
@@ -186949,42 +186906,46 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT:    v_lshlrev_b32_e32 v25, 8, v25
 ; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v29
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v27
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v2
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v29
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v4
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v6
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v4
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v6
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v10
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v12
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v10
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v24
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v12
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v26
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
 ; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v28
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v30
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v31
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v32
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v33
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v34
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v35
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v36
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v37
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v38
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:184
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:192
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:200
@@ -186993,37 +186954,34 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT:    buffer_load_ushort v5, off, s[0:3], s32 offset:224
 ; VI-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:232
 ; VI-NEXT:    buffer_load_ushort v7, off, s[0:3], s32 offset:240
-; VI-NEXT:    v_lshlrev_b32_e32 v8, 8, v26
-; VI-NEXT:    v_lshlrev_b32_e32 v10, 8, v28
+; VI-NEXT:    v_lshlrev_b32_e32 v45, 8, v22
+; VI-NEXT:    v_lshlrev_b32_e32 v8, 8, v24
 ; VI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; VI-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
 ; VI-NEXT:    v_lshlrev_b32_e32 v16, 8, v16
 ; VI-NEXT:    v_lshlrev_b32_e32 v18, 8, v18
 ; VI-NEXT:    v_lshlrev_b32_e32 v20, 8, v20
-; VI-NEXT:    v_lshlrev_b32_e32 v22, 8, v22
 ; VI-NEXT:    s_waitcnt vmcnt(7)
-; VI-NEXT:    v_lshlrev_b32_e32 v24, 8, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v22, 8, v0
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(6)
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v2
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v4
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v5
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v6
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v7
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v24, 8, v2
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:248
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:256
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:264
@@ -187042,11 +187000,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v4
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v7
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:312
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:320
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:328
@@ -187055,8 +187008,12 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:36
+; VI-NEXT:    s_waitcnt vmcnt(11)
+; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
 ; VI-NEXT:    v_lshlrev_b32_e32 v28, 8, v3
 ; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
+; VI-NEXT:    s_waitcnt vmcnt(10)
+; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v7
 ; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, 8, v0
 ; VI-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:44
@@ -187065,45 +187022,47 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:68
 ; VI-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:76
 ; VI-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:84
-; VI-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:92
-; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:100
-; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:108
+; VI-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:92
+; VI-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:100
+; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:108
 ; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:116
 ; VI-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:124
-; VI-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:132
-; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:140
-; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:148
-; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:156
-; VI-NEXT:    buffer_load_ushort v60, off, s[0:3], s32 offset:164
-; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:172
+; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:132
+; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:140
+; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:148
+; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:156
+; VI-NEXT:    buffer_load_ushort v46, off, s[0:3], s32 offset:164
+; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:172
 ; VI-NEXT:    buffer_load_ushort v47, off, s[0:3], s32 offset:180
 ; VI-NEXT:    buffer_load_ushort v63, off, s[0:3], s32 offset:188
-; VI-NEXT:    buffer_load_ushort v46, off, s[0:3], s32 offset:196
+; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:196
 ; VI-NEXT:    buffer_load_ushort v61, off, s[0:3], s32 offset:204
-; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:212
+; VI-NEXT:    buffer_load_ushort v60, off, s[0:3], s32 offset:212
 ; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:220
-; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:228
-; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:236
-; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:244
-; VI-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:252
+; VI-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:228
+; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:236
+; VI-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:244
+; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:252
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:260
 ; VI-NEXT:    s_waitcnt vmcnt(14)
-; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v1
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:268
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:276
-; VI-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:284
-; VI-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:292
-; VI-NEXT:    buffer_load_ushort v7, off, s[0:3], s32 offset:300
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:308
+; VI-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:268
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:276
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:316
-; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:324
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:284
+; VI-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:292
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:300
+; VI-NEXT:    buffer_load_ushort v7, off, s[0:3], s32 offset:308
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v7, off, s[0:3], s32 offset:316
+; VI-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:324
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
@@ -187113,50 +187072,46 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; VI-NEXT:    s_cbranch_scc0 .LBB93_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
@@ -187173,10 +187128,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
 ; VI-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -187200,7 +187156,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v17, v10
 ; VI-NEXT:    v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
@@ -187217,43 +187172,38 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v40, v42
-; VI-NEXT:    v_mov_b32_e32 v42, v44
-; VI-NEXT:    v_mov_b32_e32 v44, v45
-; VI-NEXT:    v_mov_b32_e32 v45, v62
-; VI-NEXT:    v_or_b32_sdwa v2, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v3, v53, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; VI-NEXT:    v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v34, v24
 ; VI-NEXT:    v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v36, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v37, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -187261,74 +187211,77 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v0, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v39, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; VI-NEXT:    v_or_b32_sdwa v0, v48, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v48, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v45, v62
 ; VI-NEXT:    v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v32, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v54, v22
+; VI-NEXT:    v_mov_b32_e32 v41, v24
 ; VI-NEXT:    v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_mov_b32_e32 v33, v0
+; VI-NEXT:    v_mov_b32_e32 v34, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v37, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v50, v26
+; VI-NEXT:    v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v55, v26
 ; VI-NEXT:    v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v39, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v49, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v43, v27
 ; VI-NEXT:    v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v51, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v53, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v52, v28
+; VI-NEXT:    v_mov_b32_e32 v35, v1
+; VI-NEXT:    v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v53, v28
 ; VI-NEXT:    v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT:    v_or_b32_sdwa v1, v47, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v1, v47, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v55, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v33, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_mov_b32_e32 v35, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v1, v57, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v41, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v63, v27
-; VI-NEXT:    v_mov_b32_e32 v46, v57
+; VI-NEXT:    v_mov_b32_e32 v36, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_mov_b32_e32 v36, v0
+; VI-NEXT:    v_mov_b32_e32 v56, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v56, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v61, v59
+; VI-NEXT:    v_mov_b32_e32 v58, v1
+; VI-NEXT:    v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v61, v60
+; VI-NEXT:    v_mov_b32_e32 v60, v59
 ; VI-NEXT:    v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
@@ -187340,53 +187293,55 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; VI-NEXT:    v_or_b32_sdwa v1, v44, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v1, v45, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_mov_b32_e32 v58, v0
+; VI-NEXT:    v_mov_b32_e32 v50, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
 ; VI-NEXT:    v_or_b32_sdwa v1, v62, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v43, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v52, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; VI-NEXT:    v_or_b32_sdwa v0, v59, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v60, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v46, v1
+; VI-NEXT:    v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_mov_b32_e32 v54, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v63, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v47, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_mov_b32_e32 v57, v1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v3, s4, v0
@@ -187418,10 +187373,12 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    s_cbranch_execnz .LBB93_3
 ; VI-NEXT:  .LBB93_2: ; %cmp.true
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v59
+; VI-NEXT:    v_or_b32_sdwa v29, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
 ; VI-NEXT:    s_add_i32 s28, s28, 3
 ; VI-NEXT:    s_and_b32 s4, s28, 0xff
 ; VI-NEXT:    s_lshl_b32 s5, s29, 8
@@ -187440,147 +187397,165 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT:    s_lshl_b32 s9, s19, 8
 ; VI-NEXT:    s_add_i32 s16, s16, 3
 ; VI-NEXT:    s_lshl_b32 s10, s17, 8
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v28, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v59
-; VI-NEXT:    v_or_b32_sdwa v25, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v26, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v62
-; VI-NEXT:    v_or_b32_sdwa v27, v63, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v45
-; VI-NEXT:    v_or_b32_sdwa v52, v43, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v28, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v44
-; VI-NEXT:    v_or_b32_sdwa v26, v50, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v53, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v45
+; VI-NEXT:    v_or_b32_sdwa v27, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v42
-; VI-NEXT:    v_or_b32_sdwa v63, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v52, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v40
-; VI-NEXT:    v_or_b32_sdwa v43, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v61
+; VI-NEXT:    v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v60
 ; VI-NEXT:    v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v46
-; VI-NEXT:    v_or_b32_sdwa v24, v56, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v61
+; VI-NEXT:    v_or_b32_sdwa v24, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v48, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v48, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v48, vcc, 0x300, v48
 ; VI-NEXT:    v_or_b32_sdwa v24, v24, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v24, vcc, 0x3000000, v24
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    v_or_b32_sdwa v23, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v38, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v38, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v38, vcc, 0x300, v38
 ; VI-NEXT:    v_or_b32_sdwa v23, v23, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v23, vcc, 0x3000000, v23
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v22, v34, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v22, v54, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v36, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
-; VI-NEXT:    v_add_u32_e32 v50, vcc, 0x300, v36
+; VI-NEXT:    v_or_b32_sdwa v50, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT:    v_add_u32_e32 v50, vcc, 0x300, v50
 ; VI-NEXT:    v_or_b32_sdwa v22, v22, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v22, vcc, 0x3000000, v22
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v21, v53, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v21, v35, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v53, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v54, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    v_or_b32_sdwa v20, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    v_or_b32_sdwa v49, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v49, vcc, 0x300, v49
 ; VI-NEXT:    v_or_b32_sdwa v20, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v20, vcc, 0x3000000, v20
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    v_or_b32_sdwa v19, v37, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v37, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v37, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v37, vcc, 0x300, v37
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT:    v_or_b32_sdwa v31, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
 ; VI-NEXT:    v_or_b32_sdwa v19, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v19, vcc, 0x3000000, v19
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    v_or_b32_sdwa v18, v32, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT:    v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v58, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
-; VI-NEXT:    v_add_u32_e32 v36, vcc, 0x300, v58
-; VI-NEXT:    v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_or_b32_sdwa v57, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; VI-NEXT:    v_add_u32_e32 v35, vcc, 0x300, v57
+; VI-NEXT:    v_or_b32_sdwa v18, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v18, vcc, 0x3000000, v18
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v34, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v34, vcc, 0x300, v34
 ; VI-NEXT:    v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v14, vcc, 0x3000000, v14
@@ -187589,78 +187564,67 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v13, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
-; VI-NEXT:    v_or_b32_sdwa v31, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_or_b32_sdwa v35, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v36, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
-; VI-NEXT:    v_add_u32_e32 v35, vcc, 0x300, v35
-; VI-NEXT:    v_or_b32_sdwa v13, v13, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    v_add_u32_e32 v35, vcc, 0x300, v25
-; VI-NEXT:    v_add_u32_e32 v25, vcc, 0x300, v59
-; VI-NEXT:    v_or_b32_sdwa v25, v43, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    v_or_b32_sdwa v28, v28, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT:    v_add_u32_e32 v36, vcc, 0x300, v36
+; VI-NEXT:    v_or_b32_sdwa v13, v13, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v36, vcc, 0x300, v26
+; VI-NEXT:    v_add_u32_e32 v26, vcc, 0x300, v52
+; VI-NEXT:    v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v52, vcc, 0x300, v54
+; VI-NEXT:    v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v13, vcc, 0x3000000, v13
-; VI-NEXT:    v_add_u32_e32 v25, vcc, 0x3000000, v25
-; VI-NEXT:    v_add_u32_e32 v28, vcc, 0x3000000, v28
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT:    v_add_u32_e32 v21, vcc, 0x3000000, v21
+; VI-NEXT:    v_add_u32_e32 v26, vcc, 0x3000000, v26
 ; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v12, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
-; VI-NEXT:    v_or_b32_sdwa v30, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v51, vcc, 0x300, v51
 ; VI-NEXT:    v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    v_add_u32_e32 v33, vcc, 0x300, v1
-; VI-NEXT:    v_add_u32_e32 v51, vcc, 0x300, v63
-; VI-NEXT:    v_or_b32_sdwa v26, v26, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    v_or_b32_sdwa v30, v30, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v51, vcc, 0x300, v59
+; VI-NEXT:    v_or_b32_sdwa v25, v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v12, vcc, 0x3000000, v12
-; VI-NEXT:    v_add_u32_e32 v26, vcc, 0x3000000, v26
-; VI-NEXT:    v_add_u32_e32 v30, vcc, 0x3000000, v30
+; VI-NEXT:    v_add_u32_e32 v25, vcc, 0x3000000, v25
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v57, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v33, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v40, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT:    v_or_b32_sdwa v30, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_or_b32_sdwa v39, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT:    v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_add_u32_e32 v34, vcc, 0x300, v2
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -187684,14 +187648,15 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT:    v_add_u32_e32 v41, vcc, 0x300, v10
 ; VI-NEXT:    v_add_u32_e32 v10, vcc, 0x300, v55
 ; VI-NEXT:    v_or_b32_sdwa v10, v39, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    v_add_u32_e32 v39, vcc, 0x300, v52
-; VI-NEXT:    v_add_u32_e32 v52, vcc, 0x300, v53
-; VI-NEXT:    v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    v_or_b32_sdwa v27, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v39, vcc, 0x300, v53
+; VI-NEXT:    v_or_b32_sdwa v27, v28, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_or_b32_sdwa v28, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_or_b32_sdwa v29, v30, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v9, vcc, 0x3000000, v9
 ; VI-NEXT:    v_add_u32_e32 v10, vcc, 0x3000000, v10
-; VI-NEXT:    v_add_u32_e32 v21, vcc, 0x3000000, v21
 ; VI-NEXT:    v_add_u32_e32 v27, vcc, 0x3000000, v27
+; VI-NEXT:    v_add_u32_e32 v28, vcc, 0x3000000, v28
+; VI-NEXT:    v_add_u32_e32 v29, vcc, 0x3000000, v29
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -187707,14 +187672,18 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT:    v_add_u32_e32 v42, vcc, 0x300, v42
 ; VI-NEXT:    v_or_b32_sdwa v8, v8, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v42, vcc, 0x300, v11
-; VI-NEXT:    v_or_b32_sdwa v17, v17, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v11, vcc, 0x300, v40
-; VI-NEXT:    v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_or_b32_sdwa v11, v33, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v33, vcc, 0x300, v1
+; VI-NEXT:    v_or_b32_sdwa v30, v31, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v17, v17, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, 0x3000000, v8
 ; VI-NEXT:    v_add_u32_e32 v11, vcc, 0x3000000, v11
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_add_u32_e32 v30, vcc, 0x3000000, v30
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v7, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
@@ -187754,29 +187723,19 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
-; VI-NEXT:    v_or_b32_sdwa v29, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v46, vcc, 0x300, v46
 ; VI-NEXT:    v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v5, vcc, 0x3000000, v5
-; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
-; VI-NEXT:    v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u32_e32 v34, vcc, 0x300, v2
-; VI-NEXT:    v_or_b32_sdwa v29, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    v_add_u32_e32 v29, vcc, 0x3000000, v29
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 3, v4
 ; VI-NEXT:    v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v47, vcc, 3, v32
-; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 0x300, v4
 ; VI-NEXT:    v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 0x3000000, v4
@@ -187843,38 +187802,35 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ; VI-NEXT:  .LBB93_4:
-; VI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT:    v_mov_b32_e32 v61, v59
-; VI-NEXT:    v_mov_b32_e32 v46, v57
-; VI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; VI-NEXT:    v_mov_b32_e32 v40, v42
-; VI-NEXT:    v_mov_b32_e32 v42, v44
-; VI-NEXT:    v_mov_b32_e32 v44, v45
+; VI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v61, v60
+; VI-NEXT:    v_mov_b32_e32 v60, v59
 ; VI-NEXT:    v_mov_b32_e32 v45, v62
-; VI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v57, v5
 ; VI-NEXT:    v_mov_b32_e32 v47, v4
-; VI-NEXT:    v_mov_b32_e32 v54, v3
-; VI-NEXT:    v_mov_b32_e32 v52, v28
-; VI-NEXT:    v_mov_b32_e32 v63, v27
-; VI-NEXT:    v_mov_b32_e32 v50, v26
-; VI-NEXT:    v_mov_b32_e32 v34, v24
+; VI-NEXT:    v_mov_b32_e32 v63, v3
+; VI-NEXT:    v_mov_b32_e32 v53, v28
+; VI-NEXT:    v_mov_b32_e32 v43, v27
+; VI-NEXT:    v_mov_b32_e32 v55, v26
+; VI-NEXT:    v_mov_b32_e32 v41, v24
+; VI-NEXT:    v_mov_b32_e32 v54, v22
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; VI-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; VI-NEXT:    s_branch .LBB93_2
@@ -187936,18 +187892,18 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v29
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 8, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 8, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v46, 8, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 8, v7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 8, v9
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 8, v11
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 8, v13
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 8, v15
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 8, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 8, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 8, v25
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 8, v27
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 8, v19
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 8, v21
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 8, v23
-; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 8, v25
-; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
 ; GFX9-NEXT:    s_waitcnt vmcnt(24)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v43
 ; GFX9-NEXT:    s_waitcnt vmcnt(23)
@@ -187976,10 +187932,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(23)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v52
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(23)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v51
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(23)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v50
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
@@ -187991,7 +187947,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(23)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v39
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(23)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v30
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
@@ -188039,7 +187995,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v3
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v13
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
@@ -188066,23 +188022,23 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v15
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v13
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v5
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v9
-; GFX9-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:312
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:320
@@ -188095,49 +188051,48 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    s_waitcnt vmcnt(15)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 8, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 8, v1
 ; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:44
 ; GFX9-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:52
 ; GFX9-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:60
 ; GFX9-NEXT:    buffer_load_ushort v15, off, s[0:3], s32 offset:68
 ; GFX9-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:76
 ; GFX9-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:84
-; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:92
+; GFX9-NEXT:    buffer_load_ushort v25, off, s[0:3], s32 offset:92
 ; GFX9-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:100
 ; GFX9-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:108
-; GFX9-NEXT:    buffer_load_ushort v46, off, s[0:3], s32 offset:116
+; GFX9-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:116
 ; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:124
 ; GFX9-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:132
-; GFX9-NEXT:    buffer_load_ushort v61, off, s[0:3], s32 offset:140
+; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:140
 ; GFX9-NEXT:    buffer_load_ushort v47, off, s[0:3], s32 offset:148
-; GFX9-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:156
+; GFX9-NEXT:    buffer_load_ushort v61, off, s[0:3], s32 offset:156
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:164
-; GFX9-NEXT:    s_waitcnt vmcnt(22)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(21)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:172
-; GFX9-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:180
-; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:188
-; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:196
-; GFX9-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:204
+; GFX9-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:172
+; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:180
+; GFX9-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:188
+; GFX9-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:196
+; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:204
 ; GFX9-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:212
-; GFX9-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:220
-; GFX9-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:228
+; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:220
+; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:228
 ; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:236
 ; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:244
-; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:252
-; GFX9-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:260
+; GFX9-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:252
+; GFX9-NEXT:    buffer_load_ushort v27, off, s[0:3], s32 offset:260
 ; GFX9-NEXT:    buffer_load_ushort v60, off, s[0:3], s32 offset:268
-; GFX9-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:276
+; GFX9-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:276
 ; GFX9-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:284
 ; GFX9-NEXT:    buffer_load_ushort v63, off, s[0:3], s32 offset:292
-; GFX9-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:300
-; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:308
-; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:316
+; GFX9-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:300
+; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:308
+; GFX9-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:316
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:324
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
@@ -188148,54 +188103,55 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(28)
-; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(30)
-; GFX9-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(33)
 ; GFX9-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(36)
-; GFX9-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(38)
+; GFX9-NEXT:    s_waitcnt vmcnt(39)
 ; GFX9-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(40)
-; GFX9-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(40)
-; GFX9-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(41)
+; GFX9-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(41)
+; GFX9-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(40)
+; GFX9-NEXT:    s_waitcnt vmcnt(41)
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(40)
-; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(40)
+; GFX9-NEXT:    s_waitcnt vmcnt(41)
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(41)
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
@@ -188205,7 +188161,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB93_2
@@ -188218,7 +188174,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v4, v4, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v4, v4, 16, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -188255,10 +188211,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v7, v1, 16, v0
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
@@ -188274,13 +188230,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v9, v1, 16, v0
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -188288,7 +188244,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
@@ -188329,8 +188285,8 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    v_mov_b32_e32 v52, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v14, v1, 16, v0
-; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v50, v0
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -188348,16 +188304,16 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v48, v1
-; GFX9-NEXT:    v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v17, v17, 16, v1
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    v_mov_b32_e32 v33, v46
+; GFX9-NEXT:    v_mov_b32_e32 v33, v45
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v18, v1, 16, v0
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
@@ -188370,7 +188326,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -188379,7 +188335,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -188387,122 +188343,121 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v22, v1, 16, v0
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v34, v1
-; GFX9-NEXT:    v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v23, v1, 16, v0
-; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_mov_b32_e32 v40, v0
-; GFX9-NEXT:    v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v46, v0
+; GFX9-NEXT:    v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v35, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v35, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v24, v1, 16, v0
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v51, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_mov_b32_e32 v35, v45
+; GFX9-NEXT:    v_mov_b32_e32 v45, v61
+; GFX9-NEXT:    v_mov_b32_e32 v61, v42
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v38, v0
-; GFX9-NEXT:    v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_or_b32_sdwa v1, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_lshl_or_b32 v25, v1, 16, v0
+; GFX9-NEXT:    v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v54, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v41, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    v_mov_b32_e32 v41, v57
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_lshl_or_b32 v26, v1, 16, v0
-; GFX9-NEXT:    v_or_b32_sdwa v0, v44, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_or_b32_sdwa v1, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_or_b32_sdwa v0, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v45, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_lshl_or_b32 v27, v1, 16, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v60, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v60, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_or_b32_sdwa v1, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v57, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v28, v1, 16, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v59, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v63, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_mov_b32_e32 v57, v59
+; GFX9-NEXT:    v_or_b32_sdwa v1, v63, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v29, v1, 16, v0
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v30, v1, 16, v0
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v56, v42
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v31, v1, 16, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:    s_branch .LBB93_3
 ; GFX9-NEXT:  .LBB93_2:
 ; GFX9-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_mov_b32_e32 v33, v46
-; GFX9-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_mov_b32_e32 v33, v45
+; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_mov_b32_e32 v56, v61
-; GFX9-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX9-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; GFX9-NEXT:  .LBB93_3: ; %Flow
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
@@ -188705,7 +188660,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_add_u32_e32 v0, 3, v0
-; GFX9-NEXT:    v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v1, 3, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -188765,11 +188720,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v26, 3, v26
-; GFX9-NEXT:    v_or_b32_sdwa v48, v40, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v48, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v26, 3, v26
-; GFX9-NEXT:    v_or_b32_sdwa v49, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v49, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v26, 3, v26
@@ -188804,7 +188759,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    v_or_b32_sdwa v53, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_add_u32_e32 v24, 3, v24
-; GFX9-NEXT:    v_add_u32_e32 v26, 3, v62
+; GFX9-NEXT:    v_add_u32_e32 v26, 3, v61
 ; GFX9-NEXT:    v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v36, 0x300, v24
 ; GFX9-NEXT:    v_add_u32_e32 v24, 0x300, v48
@@ -188813,7 +188768,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v54, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_add_u32_e32 v26, 3, v61
+; GFX9-NEXT:    v_add_u32_e32 v26, 3, v45
 ; GFX9-NEXT:    v_add_u32_e32 v20, 3, v20
 ; GFX9-NEXT:    v_or_b32_sdwa v20, v57, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v34, 0x300, v20
@@ -188822,7 +188777,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_add_u32_e32 v26, 3, v56
 ; GFX9-NEXT:    v_add_u32_e32 v21, 3, v21
-; GFX9-NEXT:    v_or_b32_sdwa v21, v45, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v21, v32, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v28, 0x300, v21
 ; GFX9-NEXT:    v_add_u32_e32 v21, 0x300, v54
 ; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff, v21
@@ -190678,207 +190633,204 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:52
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:48
-; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:60
-; SI-NEXT:    s_waitcnt expcnt(5)
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:56
+; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:48
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:60
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:56
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:68
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:64
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:68
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:76
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:64
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:76
+; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:72
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:72
-; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:84
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:80
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:92
+; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:84
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:80
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:92
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v8
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f16_f32_e32 v46, v4
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v33, v4
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v31, v12
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v31, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, v28
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v32, v8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v5
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v7
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v10
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v9
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v20
+; SI-NEXT:    ; implicit-def: $vgpr20
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v11
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v28
+; SI-NEXT:    ; implicit-def: $vgpr28
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v14
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v24
+; SI-NEXT:    ; implicit-def: $vgpr24
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v15
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v18
 ; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v17
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v35
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v37
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v35
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v34
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v45, v49
-; SI-NEXT:    v_cvt_f16_f32_e32 v43, v50
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; implicit-def: $vgpr50
-; SI-NEXT:    ; implicit-def: $vgpr37
 ; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr34
-; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v56, v53
-; SI-NEXT:    v_cvt_f16_f32_e32 v47, v54
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v44
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v39
+; SI-NEXT:    ; implicit-def: $vgpr39
+; SI-NEXT:    ; implicit-def: $vgpr19
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v51
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v59, v57
-; SI-NEXT:    v_cvt_f16_f32_e32 v58, v58
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v62
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v56, v53
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v55
+; SI-NEXT:    v_cvt_f16_f32_e32 v47, v54
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v55
-; SI-NEXT:    v_cvt_f16_f32_e32 v57, v61
-; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_cvt_f16_f32_e32 v61, v42
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v59, v41
+; SI-NEXT:    v_cvt_f16_f32_e32 v58, v42
+; SI-NEXT:    v_cvt_f16_f32_e32 v57, v44
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v23
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT:    ; implicit-def: $vgpr42
+; SI-NEXT:    v_cvt_f16_f32_e32 v62, v60
+; SI-NEXT:    v_cvt_f16_f32_e32 v61, v61
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, v45
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v26
+; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; kill: killed $vgpr45
 ; SI-NEXT:    ; implicit-def: $vgpr44
+; SI-NEXT:    ; implicit-def: $vgpr41
 ; SI-NEXT:    ; implicit-def: $vgpr54
 ; SI-NEXT:    ; implicit-def: $vgpr53
+; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    ; implicit-def: $vgpr26
 ; SI-NEXT:    ; implicit-def: $vgpr22
 ; SI-NEXT:    ; implicit-def: $vgpr23
 ; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; kill: killed $vgpr42
-; SI-NEXT:    ; implicit-def: $vgpr42
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v26
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v25
 ; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v27
 ; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v30
 ; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v29
 ; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v36
 ; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v37
+; SI-NEXT:    ; implicit-def: $vgpr37
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v38
 ; SI-NEXT:    ; implicit-def: $vgpr38
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v39
-; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v48
 ; SI-NEXT:    ; implicit-def: $vgpr48
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v51
-; SI-NEXT:    ; implicit-def: $vgpr51
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v49
+; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v50
+; SI-NEXT:    ; implicit-def: $vgpr50
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v52
 ; SI-NEXT:    ; implicit-def: $vgpr52
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v40
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v41
-; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v43
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v60
-; SI-NEXT:    v_cvt_f16_f32_e32 v60, v40
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:88
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:100
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:96
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:108
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:104
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:116
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:112
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:124
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v63
+; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    v_cvt_f16_f32_e32 v63, v46
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:88
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:100
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:96
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:108
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:104
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:116
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:112
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:124
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v40, v1
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_cvt_f16_f32_e32 v55, v3
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v7
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cvt_f16_f32_e32 v63, v8
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v43, v4
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v14
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cvt_f16_f32_e32 v40, v14
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cvt_f16_f32_e32 v62, v15
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:120
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v15
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:120
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:132
 ; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:128
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, v16
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v17
 ; SI-NEXT:    ; implicit-def: $vgpr16
+; SI-NEXT:    ; implicit-def: $vgpr17
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v2
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f16_f32_e32 v55, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v46, v14
 ; SI-NEXT:    ; implicit-def: $vgpr14
 ; SI-NEXT:    ; kill: killed $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr14
@@ -191002,30 +190954,28 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr14
 ; SI-NEXT:    ; kill: killed $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v15
 ; SI-NEXT:    ; kill: killed $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr15
 ; SI-NEXT:    ; kill: killed $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; kill: killed $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr15
 ; SI-NEXT:    ; kill: killed $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; kill: killed $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr15
 ; SI-NEXT:    ; kill: killed $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; kill: killed $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr15
 ; SI-NEXT:    ; kill: killed $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; kill: killed $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr15
 ; SI-NEXT:    ; kill: killed $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; kill: killed $vgpr15
+; SI-NEXT:    ; kill: killed $vgpr14
+; SI-NEXT:    ; implicit-def: $vgpr14
+; SI-NEXT:    ; kill: killed $vgpr14
+; SI-NEXT:    ; implicit-def: $vgpr14
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v15
+; SI-NEXT:    ; kill: killed $vgpr14
+; SI-NEXT:    ; implicit-def: $vgpr14
+; SI-NEXT:    ; kill: killed $vgpr14
+; SI-NEXT:    ; implicit-def: $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr15
 ; SI-NEXT:    ; kill: killed $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr14
@@ -191035,400 +190985,399 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr14
 ; SI-NEXT:    ; kill: killed $vgpr15
 ; SI-NEXT:    ; implicit-def: $vgpr15
+; SI-NEXT:    v_mov_b32_e32 v45, v46
+; SI-NEXT:    v_mov_b32_e32 v46, v6
+; SI-NEXT:    v_mov_b32_e32 v6, v5
+; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; kill: killed $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr14
 ; SI-NEXT:    ; kill: killed $vgpr15
 ; SI-NEXT:    ; implicit-def: $vgpr15
+; SI-NEXT:    ; kill: killed $vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; kill: killed $vgpr5
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB94_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v16, v46
-; SI-NEXT:    v_mov_b32_e32 v42, v40
-; SI-NEXT:    v_mov_b32_e32 v40, v55
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v20, v1, v2
-; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_or_b32_e32 v44, v15, v14
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v46
-; SI-NEXT:    v_mov_b32_e32 v46, v33
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v41, v15, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v54, v15, v14
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v44, v5, v14
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v33
-; SI-NEXT:    v_mov_b32_e32 v33, v32
-; SI-NEXT:    v_mov_b32_e32 v32, v31
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v53, v15, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_or_b32_e32 v41, v5, v14
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v51, v15, v14
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v54, v5, v14
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v32
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v53, v5, v14
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v51, v5, v14
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v31
-; SI-NEXT:    v_mov_b32_e32 v31, v13
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v52, v15, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_or_b32_e32 v52, v5, v14
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v49, v15, v14
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
-; SI-NEXT:    v_mov_b32_e32 v13, v12
-; SI-NEXT:    v_mov_b32_e32 v12, v11
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_or_b32_e32 v50, v15, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v48, v15, v14
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
+; SI-NEXT:    v_or_b32_e32 v49, v5, v14
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v39, v15, v14
-; SI-NEXT:    v_alignbit_b32 v14, v41, v44, 24
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_or_b32_e32 v50, v5, v14
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v37, v15, v14
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v33
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v48, v5, v14
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v9
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v38, v15, v14
-; SI-NEXT:    v_alignbit_b32 v14, v41, v44, 16
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT:    v_or_b32_e32 v39, v5, v14
+; SI-NEXT:    v_alignbit_b32 v5, v41, v44, 24
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v35, v15, v14
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v37, v5, v14
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v36, v15, v14
-; SI-NEXT:    v_alignbit_b32 v14, v41, v44, 8
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT:    v_or_b32_e32 v38, v5, v14
+; SI-NEXT:    v_alignbit_b32 v5, v41, v44, 16
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v30, v15, v14
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v12
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v34, v15, v14
-; SI-NEXT:    v_alignbit_b32 v14, v53, v54, 24
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT:    v_or_b32_e32 v35, v5, v14
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v36, v5, v14
+; SI-NEXT:    v_alignbit_b32 v5, v41, v44, 8
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v28, v14, v9
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v55
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v30, v5, v14
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v34, v5, v14
+; SI-NEXT:    v_alignbit_b32 v5, v53, v54, 24
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v28, v5, v14
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v8
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v29, v5, v14
+; SI-NEXT:    v_alignbit_b32 v5, v53, v54, 16
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v29, v14, v9
-; SI-NEXT:    v_alignbit_b32 v9, v53, v54, 16
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT:    v_or_b32_e32 v26, v5, v14
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v7
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v27, v5, v14
+; SI-NEXT:    v_alignbit_b32 v5, v53, v54, 8
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v45
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v26, v43, v9
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v56
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v24, v47, v14
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v6
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v45
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_or_b32_e32 v27, v14, v9
-; SI-NEXT:    v_alignbit_b32 v9, v53, v54, 8
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT:    v_or_b32_e32 v25, v5, v14
+; SI-NEXT:    v_alignbit_b32 v5, v52, v51, 24
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v56
-; SI-NEXT:    v_or_b32_e32 v24, v47, v9
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v43
-; SI-NEXT:    v_or_b32_e32 v25, v10, v9
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT:    v_alignbit_b32 v9, v52, v51, 24
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v59
+; SI-NEXT:    v_or_b32_e32 v22, v58, v5
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v56
+; SI-NEXT:    v_or_b32_e32 v23, v57, v5
+; SI-NEXT:    v_alignbit_b32 v5, v52, v51, 16
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v59
-; SI-NEXT:    v_or_b32_e32 v22, v58, v9
-; SI-NEXT:    ; implicit-def: $vgpr56
-; SI-NEXT:    ; implicit-def: $vgpr47
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v62
+; SI-NEXT:    v_or_b32_e32 v20, v61, v5
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v47
+; SI-NEXT:    v_or_b32_e32 v21, v60, v5
+; SI-NEXT:    v_alignbit_b32 v5, v52, v51, 8
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v63
+; SI-NEXT:    v_or_b32_e32 v18, v40, v5
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
 ; SI-NEXT:    ; implicit-def: $vgpr59
 ; SI-NEXT:    ; implicit-def: $vgpr58
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; SI-NEXT:    v_or_b32_e32 v23, v57, v9
-; SI-NEXT:    v_alignbit_b32 v9, v52, v51, 16
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
 ; SI-NEXT:    ; implicit-def: $vgpr57
+; SI-NEXT:    ; implicit-def: $vgpr62
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr60
+; SI-NEXT:    ; implicit-def: $vgpr63
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v9
-; SI-NEXT:    v_or_b32_e32 v21, v60, v1
-; SI-NEXT:    v_alignbit_b32 v1, v52, v51, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v40
+; SI-NEXT:    v_or_b32_e32 v19, v55, v5
+; SI-NEXT:    v_alignbit_b32 v5, v50, v49, 24
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v61
-; SI-NEXT:    v_or_b32_e32 v18, v4, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT:    v_or_b32_e32 v19, v3, v1
-; SI-NEXT:    v_alignbit_b32 v1, v50, v49, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
-; SI-NEXT:    v_or_b32_e32 v14, v63, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v42
-; SI-NEXT:    v_or_b32_e32 v17, v62, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v43
+; SI-NEXT:    v_or_b32_e32 v16, v1, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v46
+; SI-NEXT:    v_or_b32_e32 v17, v42, v1
 ; SI-NEXT:    v_alignbit_b32 v1, v50, v49, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v7
-; SI-NEXT:    v_or_b32_e32 v3, v8, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v40
-; SI-NEXT:    v_or_b32_e32 v15, v6, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; SI-NEXT:    v_or_b32_e32 v14, v4, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v45
+; SI-NEXT:    v_or_b32_e32 v15, v2, v1
 ; SI-NEXT:    v_alignbit_b32 v1, v50, v49, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v39, v48, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v39, v48, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v39, v48, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v38, v37, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v38, v37, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v38, v37, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v36, v35, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v36, v35, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v36, v35, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v34, v30, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v34, v30, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v34, v30, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v29, v28, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v29, v28, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v29, v28, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v27, v26, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v27, v26, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v27, v26, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v25, v24, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v25, v24, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v25, v24, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v23, v22, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v23, v22, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v23, v22, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v21, v20, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v21, v20, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v21, v20, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v19, v18, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v19, v18, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v19, v18, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v17, v14, 24
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v17, v14, 16
+; SI-NEXT:    v_alignbit_b32 v1, v19, v18, 8
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v17, v14, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v17, v16, 24
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v15, v3, 24
+; SI-NEXT:    v_alignbit_b32 v1, v17, v16, 16
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v15, v3, 16
+; SI-NEXT:    v_alignbit_b32 v1, v17, v16, 8
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v15, v3, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v15, v14, 24
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v1, v15, v14, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v1, v15, v14, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v41
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v53
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v52
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v50
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v39
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v36
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v34
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v29
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v27
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v25
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v23
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v21
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v19
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v17
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v15
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v16, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v46, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v1, v33, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_bfe_u32 v1, v32, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v31, 8, 8
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v11, 8, 8
-; SI-NEXT:    v_mov_b32_e32 v11, v12
-; SI-NEXT:    v_mov_b32_e32 v12, v13
-; SI-NEXT:    v_mov_b32_e32 v13, v31
-; SI-NEXT:    v_mov_b32_e32 v31, v32
-; SI-NEXT:    v_mov_b32_e32 v32, v33
+; SI-NEXT:    v_bfe_u32 v1, v31, 8, 8
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v32, 8, 8
+; SI-NEXT:    v_bfe_u32 v1, v12, 8, 8
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v12, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v1, v9, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_bfe_u32 v1, v13, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_bfe_u32 v1, v11, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v55, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v1, v10, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v45, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v1, v8, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v43, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v1, v7, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v10, 8, 8
+; SI-NEXT:    v_bfe_u32 v1, v6, 8, 8
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v9, 8, 8
-; SI-NEXT:    v_mov_b32_e32 v55, v40
-; SI-NEXT:    v_mov_b32_e32 v40, v42
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v1, v56, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v2, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v1, v47, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_bfe_u32 v1, v40, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_bfe_u32 v1, v46, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_bfe_u32 v1, v45, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
@@ -191482,209 +191431,210 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    v_mov_b32_e32 v33, v46
-; SI-NEXT:    v_mov_b32_e32 v46, v16
-; SI-NEXT:    v_mov_b32_e32 v16, v3
-; SI-NEXT:    v_bfe_u32 v42, v55, 8, 8
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr43
 ; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr60
-; SI-NEXT:    ; implicit-def: $vgpr61
-; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr56
+; SI-NEXT:    ; implicit-def: $vgpr47
+; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr40
+; SI-NEXT:    ; implicit-def: $vgpr55
+; SI-NEXT:    ; implicit-def: $vgpr43
+; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr42
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr63
-; SI-NEXT:    ; implicit-def: $vgpr62
-; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:  .LBB94_2: ; %Flow
-; SI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; SI-NEXT:    s_or_saveexec_b64 s[4:5], s[4:5]
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; SI-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB94_4
 ; SI-NEXT:  ; %bb.3: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_or_b32_e32 v16, v8, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v55
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v55, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v55
-; SI-NEXT:    v_or_b32_e32 v15, v6, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v63
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v14, v4, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v45
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_or_b32_e32 v14, v6, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v40
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v62
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v40, v5
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT:    v_or_b32_e32 v20, v1, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v40
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v47
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_or_b32_e32 v17, v5, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v61
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v60
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v18, v4, v5
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f16_f32_e32 v45, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v45
+; SI-NEXT:    v_or_b32_e32 v15, v2, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v43
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_or_b32_e32 v16, v1, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v46
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v42
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v46, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v46
+; SI-NEXT:    v_or_b32_e32 v17, v1, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v63
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v40
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v18, v2, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v55
 ; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v42, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v40, v1
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v40
+; SI-NEXT:    v_or_b32_e32 v19, v1, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v62
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v61
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v20, v2, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v60
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v42
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v55, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v55
 ; SI-NEXT:    v_or_b32_e32 v21, v1, v2
-; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v59
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v58
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; SI-NEXT:    v_or_b32_e32 v19, v3, v4
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v47
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v22, v2, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v57
-; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v56
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, v2
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v42
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; SI-NEXT:    v_or_b32_e32 v23, v1, v3
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v10
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v6
 ; SI-NEXT:    v_or_b32_e32 v24, v3, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v45
-; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
 ; SI-NEXT:    v_or_b32_e32 v25, v2, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v43
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v7
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v26, v2, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v9
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v7
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_or_b32_e32 v28, v4, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v8
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_or_b32_e32 v27, v3, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v8
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v1
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
 ; SI-NEXT:    v_or_b32_e32 v29, v1, v3
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v10
 ; SI-NEXT:    v_or_b32_e32 v30, v3, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
 ; SI-NEXT:    v_or_b32_e32 v34, v2, v3
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v12
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v11
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -191697,24 +191647,24 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v35, v2, v1
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v12
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v11
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v37, v4, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_or_b32_e32 v36, v3, v1
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v32
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v13
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v1
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    s_waitcnt vmcnt(1)
@@ -191724,36 +191674,34 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v32
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v13
 ; SI-NEXT:    v_or_b32_e32 v38, v1, v3
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v9
 ; SI-NEXT:    v_or_b32_e32 v48, v3, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v9
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v2
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v10
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
 ; SI-NEXT:    v_or_b32_e32 v39, v2, v3
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v13
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v12
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
@@ -191764,21 +191712,21 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v49, v2, v1
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v13
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v12
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v51, v4, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_or_b32_e32 v50, v3, v1
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v31
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v31, v1
@@ -191794,30 +191742,30 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v31
 ; SI-NEXT:    v_or_b32_e32 v52, v1, v3
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v33
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v32
 ; SI-NEXT:    v_or_b32_e32 v54, v3, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v33
+; SI-NEXT:    v_cvt_f16_f32_e32 v32, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v32
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
 ; SI-NEXT:    v_or_b32_e32 v53, v2, v3
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v46
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v33
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v46, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v33, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
@@ -191828,279 +191776,274 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v44, v2, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v46
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v33
 ; SI-NEXT:    v_or_b32_e32 v41, v3, v1
 ; SI-NEXT:    v_alignbit_b32 v1, v41, v44, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v41, v44, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v41, v44, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v53, v54, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v53, v54, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v53, v54, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v52, v51, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v52, v51, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v52, v51, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v50, v49, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v50, v49, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v50, v49, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v39, v48, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v39, v48, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v39, v48, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v38, v37, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v38, v37, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v38, v37, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v36, v35, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v36, v35, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v36, v35, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v34, v30, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v34, v30, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v34, v30, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v29, v28, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v29, v28, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v29, v28, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v27, v26, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v27, v26, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v27, v26, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v25, v24, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v25, v24, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v25, v24, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v23, v22, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v23, v22, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v23, v22, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v21, v20, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v21, v20, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v21, v20, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v19, v18, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v19, v18, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v19, v18, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v17, v14, 24
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v17, v14, 16
+; SI-NEXT:    v_alignbit_b32 v1, v19, v18, 8
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v17, v14, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v17, v16, 24
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v15, v16, 24
+; SI-NEXT:    v_alignbit_b32 v1, v17, v16, 16
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v15, v16, 16
+; SI-NEXT:    v_alignbit_b32 v1, v17, v16, 8
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v15, v16, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v15, v14, 24
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v1, v15, v14, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v1, v15, v14, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v41
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v53
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v52
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v50
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v39
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v36
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v34
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v29
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v27
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v25
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v23
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v21
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v19
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v17
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v15
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v46, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_bfe_u32 v1, v33, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v31, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v13, 8, 8
+; SI-NEXT:    v_bfe_u32 v1, v32, 8, 8
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_bfe_u32 v1, v10, 8, 8
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_bfe_u32 v1, v31, 8, 8
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v32, 8, 8
+; SI-NEXT:    v_bfe_u32 v1, v12, 8, 8
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v12, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v1, v9, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_bfe_u32 v1, v13, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_bfe_u32 v1, v11, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_bfe_u32 v1, v9, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_bfe_u32 v1, v10, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_bfe_u32 v1, v8, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_bfe_u32 v1, v7, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_bfe_u32 v1, v6, 8, 8
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    v_bfe_u32 v1, v42, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_bfe_u32 v1, v55, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_bfe_u32 v1, v5, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_bfe_u32 v1, v40, 8, 8
-; SI-NEXT:    v_bfe_u32 v42, v55, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_bfe_u32 v1, v46, 8, 8
+; SI-NEXT:    v_alignbit_b32 v5, v41, v44, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_bfe_u32 v1, v45, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; SI-NEXT:  .LBB94_4: ; %end
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v44
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v41
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v46
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v33
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -192109,14 +192052,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v54
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -192127,14 +192070,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v53
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v33
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v32
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -192143,14 +192086,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 12, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v51
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -192161,8 +192104,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 16, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v52
 ; SI-NEXT:    s_waitcnt vmcnt(1)
@@ -192177,14 +192120,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 20, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v49
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -192195,14 +192138,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 24, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v50
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v13
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v12
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -192211,14 +192154,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 28, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v48
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -192229,32 +192172,30 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v39
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v9
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 36, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v37
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -192265,14 +192206,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 40, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v38
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v32
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v13
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -192281,14 +192222,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -192299,14 +192240,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 48, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v36
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v12
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -192315,14 +192256,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v30
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -192333,14 +192274,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v34
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v11
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v10
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -192349,14 +192290,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v28
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -192367,32 +192308,30 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 64, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v29
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v8
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v26
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -192403,35 +192342,33 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x48, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v27
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v7
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v24
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
@@ -192439,32 +192376,30 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x50, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v25
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v22
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -192475,14 +192410,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v23
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -192493,14 +192428,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v20
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -192511,14 +192446,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v21
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -192529,14 +192464,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v18
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -192547,14 +192482,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v19
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -192565,14 +192500,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x6c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v14
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v16
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -192583,14 +192518,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v17
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v40
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v46
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
@@ -192599,14 +192534,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x74, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v16
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v14
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -192617,16 +192552,18 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v42
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v55
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v45
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -192670,56 +192607,54 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
-; VI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4
-; VI-NEXT:    buffer_load_dword v61, off, s[0:3], s32
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v16
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v15
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_dword v60, off, s[0:3], s32
 ; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v13
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v10
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v9
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v7
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v5
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v3
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v1
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v29
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v28
-; VI-NEXT:    v_lshrrev_b32_e32 v48, 16, v14
-; VI-NEXT:    v_lshrrev_b32_e32 v41, 16, v12
-; VI-NEXT:    v_lshrrev_b32_e32 v57, 16, v11
-; VI-NEXT:    v_lshrrev_b32_e32 v63, 16, v9
-; VI-NEXT:    v_lshrrev_b32_e32 v45, 16, v8
-; VI-NEXT:    v_lshrrev_b32_e32 v44, 16, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v47, 16, v5
-; VI-NEXT:    v_lshrrev_b32_e32 v37, 16, v4
-; VI-NEXT:    v_lshrrev_b32_e32 v49, 16, v2
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v27
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v25
+; VI-NEXT:    v_lshrrev_b32_e32 v48, 16, v16
+; VI-NEXT:    v_lshrrev_b32_e32 v39, 16, v15
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v14
+; VI-NEXT:    v_lshrrev_b32_e32 v54, 16, v12
+; VI-NEXT:    v_lshrrev_b32_e32 v62, 16, v11
+; VI-NEXT:    v_lshrrev_b32_e32 v63, 16, v10
+; VI-NEXT:    v_lshrrev_b32_e32 v37, 16, v8
+; VI-NEXT:    v_lshrrev_b32_e32 v47, 16, v6
+; VI-NEXT:    v_lshrrev_b32_e32 v44, 16, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v55, 16, v2
+; VI-NEXT:    v_lshrrev_b32_e32 v56, 16, v28
 ; VI-NEXT:    v_lshrrev_b32_e32 v58, 16, v26
-; VI-NEXT:    v_lshrrev_b32_e32 v35, 16, v25
-; VI-NEXT:    v_lshrrev_b32_e32 v55, 16, v24
-; VI-NEXT:    v_lshrrev_b32_e32 v38, 16, v23
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v22
-; VI-NEXT:    v_lshrrev_b32_e32 v53, 16, v21
-; VI-NEXT:    v_lshrrev_b32_e32 v46, 16, v20
-; VI-NEXT:    v_lshrrev_b32_e32 v39, 16, v19
-; VI-NEXT:    v_lshrrev_b32_e32 v54, 16, v18
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v57, 16, v24
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v23
+; VI-NEXT:    v_lshrrev_b32_e32 v59, 16, v22
+; VI-NEXT:    v_lshrrev_b32_e32 v38, 16, v21
+; VI-NEXT:    v_lshrrev_b32_e32 v53, 16, v20
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v19
+; VI-NEXT:    v_lshrrev_b32_e32 v41, 16, v18
+; VI-NEXT:    v_lshrrev_b32_e32 v46, 16, v17
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr52
+; VI-NEXT:    ; implicit-def: $vgpr45
+; VI-NEXT:    ; implicit-def: $vgpr42
 ; VI-NEXT:    s_waitcnt vmcnt(13)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
-; VI-NEXT:    s_waitcnt vmcnt(12)
-; VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v62
-; VI-NEXT:    s_waitcnt vmcnt(11)
-; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v61
-; VI-NEXT:    v_lshrrev_b32_e32 v31, 16, v17
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    ; kill: killed $vgpr31
 ; VI-NEXT:    ; implicit-def: $vgpr31
@@ -192772,7 +192707,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    ; kill: killed $vgpr31
 ; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    ; kill: killed $vgpr31
-; VI-NEXT:    ; implicit-def: $vgpr59
 ; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    ; kill: killed $vgpr31
 ; VI-NEXT:    ; implicit-def: $vgpr31
@@ -192816,30 +192750,31 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    ; kill: killed $vgpr31
 ; VI-NEXT:    ; implicit-def: $vgpr31
-; VI-NEXT:    ; implicit-def: $vgpr56
-; VI-NEXT:    ; implicit-def: $vgpr42
-; VI-NEXT:    ; implicit-def: $vgpr32
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr32
+; VI-NEXT:    s_waitcnt vmcnt(12)
+; VI-NEXT:    v_lshrrev_b32_e32 v36, 16, v61
+; VI-NEXT:    s_waitcnt vmcnt(11)
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v60
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr32
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr32
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr32
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr50
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr52
+; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    ; implicit-def: $vgpr40
-; VI-NEXT:    ; implicit-def: $vgpr32
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr49
+; VI-NEXT:    ; implicit-def: $vgpr35
 ; VI-NEXT:    ; implicit-def: $vgpr32
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
@@ -192854,773 +192789,770 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    s_cbranch_execz .LBB94_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v16
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v16
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v15
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v14
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v13
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v12
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v12
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v11
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v10
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v10
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v9
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v8
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v8
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v7
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v6
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v6
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v4
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v4
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v3
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v2
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v2
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v1
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v62
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v62
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v61
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v61
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v60
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v30
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v30
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v29
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v28
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v28
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v27
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v26
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v26
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v25
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v24
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v24
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v23
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v22
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v22
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v21
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v20
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v20
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v19
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v18
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v18
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v17
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[31:32], 24, v[15:16]
 ; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[31:32], 24, v[13:14]
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[31:32], 24, v[11:12]
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[31:32], 24, v[9:10]
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[31:32], 24, v[7:8]
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v31, v43
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v31, v33
+; VI-NEXT:    v_mov_b32_e32 v33, v43
 ; VI-NEXT:    v_lshrrev_b64 v[42:43], 24, v[5:6]
-; VI-NEXT:    v_mov_b32_e32 v32, v57
-; VI-NEXT:    v_lshrrev_b64 v[56:57], 24, v[3:4]
-; VI-NEXT:    v_mov_b32_e32 v43, v31
-; VI-NEXT:    v_mov_b32_e32 v57, v32
-; VI-NEXT:    v_lshrrev_b64 v[31:32], 24, v[1:2]
-; VI-NEXT:    v_lshrrev_b64 v[32:33], 24, v[61:62]
+; VI-NEXT:    v_mov_b32_e32 v43, v33
+; VI-NEXT:    v_mov_b32_e32 v33, v46
+; VI-NEXT:    v_lshrrev_b64 v[45:46], 24, v[3:4]
+; VI-NEXT:    v_mov_b32_e32 v46, v33
+; VI-NEXT:    v_mov_b32_e32 v33, v53
+; VI-NEXT:    v_lshrrev_b64 v[52:53], 24, v[1:2]
+; VI-NEXT:    v_mov_b32_e32 v53, v33
+; VI-NEXT:    v_lshrrev_b64 v[32:33], 24, v[60:61]
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[32:33], 24, v[29:30]
+; VI-NEXT:    v_lshrrev_b64 v[34:35], 24, v[27:28]
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b64 v[59:60], 24, v[27:28]
-; VI-NEXT:    v_mov_b32_e32 v32, v35
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v33, v36
 ; VI-NEXT:    v_lshrrev_b64 v[35:36], 24, v[25:26]
-; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v35, v32
-; VI-NEXT:    v_mov_b32_e32 v32, v41
-; VI-NEXT:    v_lshrrev_b64 v[40:41], 24, v[23:24]
-; VI-NEXT:    v_mov_b32_e32 v41, v32
-; VI-NEXT:    v_mov_b32_e32 v32, v53
-; VI-NEXT:    v_lshrrev_b64 v[52:53], 24, v[21:22]
-; VI-NEXT:    v_lshrrev_b32_e32 v50, 24, v18
-; VI-NEXT:    v_mov_b32_e32 v53, v32
-; VI-NEXT:    v_mov_b32_e32 v32, v34
-; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[19:20]
-; VI-NEXT:    v_mov_b32_e32 v59, v50
-; VI-NEXT:    v_mov_b32_e32 v34, v32
-; VI-NEXT:    v_mov_b32_e32 v32, v51
+; VI-NEXT:    v_lshrrev_b64 v[49:50], 24, v[23:24]
+; VI-NEXT:    v_mov_b32_e32 v36, v33
+; VI-NEXT:    v_mov_b32_e32 v33, v41
+; VI-NEXT:    v_lshrrev_b64 v[40:41], 24, v[21:22]
+; VI-NEXT:    v_mov_b32_e32 v34, v51
 ; VI-NEXT:    v_lshrrev_b64 v[50:51], 24, v[17:18]
-; VI-NEXT:    v_mov_b32_e32 v51, v32
+; VI-NEXT:    v_mov_b32_e32 v41, v33
+; VI-NEXT:    v_mov_b32_e32 v33, v31
+; VI-NEXT:    v_lshrrev_b64 v[31:32], 24, v[19:20]
+; VI-NEXT:    v_mov_b32_e32 v51, v34
 ; VI-NEXT:  .LBB94_2: ; %Flow
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB94_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
-; VI-NEXT:    v_mov_b32_e32 v55, 0x200
-; VI-NEXT:    v_add_f16_sdwa v31, v18, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v63, 0x200
+; VI-NEXT:    v_add_f16_sdwa v31, v18, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; VI-NEXT:    v_add_f16_e32 v18, 0x200, v18
 ; VI-NEXT:    v_or_b32_e32 v32, v18, v31
-; VI-NEXT:    v_add_f16_sdwa v31, v17, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT:    v_add_f16_sdwa v31, v17, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; VI-NEXT:    v_add_f16_e32 v17, 0x200, v17
 ; VI-NEXT:    v_or_b32_e32 v31, v17, v31
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_f16_sdwa v31, v20, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT:    v_add_f16_sdwa v31, v20, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; VI-NEXT:    v_add_f16_e32 v20, 0x200, v20
 ; VI-NEXT:    v_or_b32_e32 v32, v20, v31
-; VI-NEXT:    v_add_f16_sdwa v31, v19, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; VI-NEXT:    v_add_f16_sdwa v31, v19, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; VI-NEXT:    v_add_f16_e32 v19, 0x200, v19
 ; VI-NEXT:    v_or_b32_e32 v31, v19, v31
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_f16_sdwa v31, v22, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT:    v_add_f16_sdwa v34, v22, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v34
 ; VI-NEXT:    v_add_f16_e32 v22, 0x200, v22
 ; VI-NEXT:    v_or_b32_e32 v32, v22, v31
-; VI-NEXT:    v_add_f16_sdwa v31, v21, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; VI-NEXT:    v_add_f16_sdwa v31, v21, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; VI-NEXT:    v_add_f16_e32 v21, 0x200, v21
 ; VI-NEXT:    v_or_b32_e32 v31, v21, v31
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_f16_sdwa v31, v24, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT:    v_add_f16_sdwa v31, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; VI-NEXT:    v_add_f16_e32 v24, 0x200, v24
 ; VI-NEXT:    v_or_b32_e32 v32, v24, v31
-; VI-NEXT:    v_add_f16_sdwa v31, v23, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; VI-NEXT:    v_add_f16_sdwa v31, v23, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; VI-NEXT:    v_add_f16_e32 v23, 0x200, v23
 ; VI-NEXT:    v_or_b32_e32 v31, v23, v31
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_f16_sdwa v31, v26, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT:    v_add_f16_sdwa v31, v26, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; VI-NEXT:    v_add_f16_e32 v26, 0x200, v26
-; VI-NEXT:    v_or_b32_e32 v32, v26, v31
-; VI-NEXT:    v_add_f16_sdwa v31, v25, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v36, v26, v31
+; VI-NEXT:    v_add_f16_sdwa v31, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; VI-NEXT:    v_add_f16_e32 v25, 0x200, v25
-; VI-NEXT:    v_or_b32_e32 v31, v25, v31
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_f16_sdwa v31, v28, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v35, v25, v31
+; VI-NEXT:    v_add_f16_sdwa v31, v28, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; VI-NEXT:    v_add_f16_e32 v28, 0x200, v28
 ; VI-NEXT:    v_or_b32_e32 v38, v28, v31
-; VI-NEXT:    v_add_f16_sdwa v31, v27, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT:    v_add_f16_sdwa v31, v27, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; VI-NEXT:    v_add_f16_e32 v27, 0x200, v27
 ; VI-NEXT:    v_or_b32_e32 v37, v27, v31
-; VI-NEXT:    v_add_f16_sdwa v31, v30, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_f16_sdwa v31, v30, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; VI-NEXT:    v_add_f16_e32 v30, 0x200, v30
-; VI-NEXT:    v_add_f16_sdwa v32, v29, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_f16_sdwa v32, v29, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_f16_e32 v29, 0x200, v29
-; VI-NEXT:    v_or_b32_e32 v34, v30, v31
+; VI-NEXT:    v_or_b32_e32 v49, v30, v31
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v32
-; VI-NEXT:    v_or_b32_e32 v33, v29, v31
-; VI-NEXT:    v_add_f16_sdwa v31, v62, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v48, v29, v31
+; VI-NEXT:    v_add_f16_sdwa v31, v61, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_f16_e32 v62, 0x200, v62
-; VI-NEXT:    v_add_f16_sdwa v32, v61, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; VI-NEXT:    v_add_f16_e32 v61, 0x200, v61
-; VI-NEXT:    v_or_b32_e32 v51, v62, v31
+; VI-NEXT:    v_add_f16_sdwa v32, v60, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
+; VI-NEXT:    v_add_f16_e32 v60, 0x200, v60
+; VI-NEXT:    v_or_b32_e32 v51, v61, v31
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v32
-; VI-NEXT:    v_add_f16_sdwa v47, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
-; VI-NEXT:    v_or_b32_e32 v50, v61, v31
+; VI-NEXT:    v_or_b32_e32 v50, v60, v31
+; VI-NEXT:    v_add_f16_sdwa v31, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
 ; VI-NEXT:    v_add_f16_e32 v2, 0x200, v2
-; VI-NEXT:    v_add_f16_sdwa v32, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v47
+; VI-NEXT:    v_add_f16_sdwa v32, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; VI-NEXT:    v_add_f16_e32 v1, 0x200, v1
 ; VI-NEXT:    v_or_b32_e32 v53, v2, v31
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v32
 ; VI-NEXT:    v_or_b32_e32 v52, v1, v31
-; VI-NEXT:    v_add_f16_sdwa v31, v4, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; VI-NEXT:    v_add_f16_sdwa v31, v4, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
 ; VI-NEXT:    v_add_f16_e32 v4, 0x200, v4
-; VI-NEXT:    v_add_f16_sdwa v32, v3, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_f16_sdwa v32, v3, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; VI-NEXT:    v_add_f16_e32 v3, 0x200, v3
 ; VI-NEXT:    v_or_b32_e32 v46, v4, v31
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v32
-; VI-NEXT:    v_add_f16_sdwa v36, v6, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; VI-NEXT:    v_or_b32_e32 v45, v3, v31
+; VI-NEXT:    v_add_f16_sdwa v31, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
 ; VI-NEXT:    v_add_f16_e32 v6, 0x200, v6
-; VI-NEXT:    v_add_f16_sdwa v32, v5, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v36
+; VI-NEXT:    v_add_f16_sdwa v32, v5, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
 ; VI-NEXT:    v_add_f16_e32 v5, 0x200, v5
 ; VI-NEXT:    v_or_b32_e32 v43, v6, v31
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v32
-; VI-NEXT:    v_add_f16_sdwa v44, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; VI-NEXT:    v_add_f16_sdwa v44, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; VI-NEXT:    v_or_b32_e32 v42, v5, v31
 ; VI-NEXT:    v_add_f16_e32 v8, 0x200, v8
-; VI-NEXT:    v_add_f16_sdwa v32, v7, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_f16_sdwa v32, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v44
 ; VI-NEXT:    v_add_f16_e32 v7, 0x200, v7
 ; VI-NEXT:    v_or_b32_e32 v41, v8, v31
 ; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v32
-; VI-NEXT:    v_add_f16_sdwa v49, v10, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v40, v7, v31
+; VI-NEXT:    v_add_f16_sdwa v31, v10, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
 ; VI-NEXT:    v_add_f16_e32 v10, 0x200, v10
-; VI-NEXT:    v_add_f16_sdwa v35, v9, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v49
-; VI-NEXT:    v_add_f16_sdwa v39, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; VI-NEXT:    v_or_b32_e32 v32, v10, v31
-; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v35
+; VI-NEXT:    v_add_f16_sdwa v32, v9, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
+; VI-NEXT:    v_add_f16_e32 v9, 0x200, v9
+; VI-NEXT:    v_or_b32_e32 v55, v10, v31
+; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v32
+; VI-NEXT:    v_add_f16_sdwa v39, v12, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v54, v9, v31
 ; VI-NEXT:    v_add_f16_e32 v12, 0x200, v12
-; VI-NEXT:    v_add_f16_sdwa v35, v11, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b32_e32 v54, 16, v39
-; VI-NEXT:    v_add_f16_e32 v11, 0x200, v11
-; VI-NEXT:    v_or_b32_e32 v57, v12, v54
-; VI-NEXT:    v_lshlrev_b32_e32 v54, 16, v35
-; VI-NEXT:    v_add_f16_sdwa v63, v14, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v56, v11, v54
+; VI-NEXT:    v_add_f16_sdwa v33, v11, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v39
+; VI-NEXT:    v_add_f16_sdwa v47, v14, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v32, v12, v31
+; VI-NEXT:    v_lshlrev_b32_e32 v31, 16, v33
 ; VI-NEXT:    v_add_f16_e32 v14, 0x200, v14
-; VI-NEXT:    v_add_f16_sdwa v48, v13, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b32_e32 v54, 16, v63
-; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_f16_e32 v13, 0x200, v13
-; VI-NEXT:    v_or_b32_e32 v59, v14, v54
-; VI-NEXT:    v_lshlrev_b32_e32 v54, 16, v48
-; VI-NEXT:    v_add_f16_sdwa v48, v16, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v58, v13, v54
+; VI-NEXT:    v_add_f16_sdwa v33, v13, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b32_e32 v56, 16, v47
+; VI-NEXT:    v_or_b32_e32 v57, v14, v56
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v56, 16, v33
+; VI-NEXT:    v_add_f16_sdwa v33, v16, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_f16_e32 v16, 0x200, v16
-; VI-NEXT:    v_add_f16_sdwa v60, v15, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b32_e32 v54, 16, v48
+; VI-NEXT:    v_add_f16_sdwa v63, v15, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b32_e32 v58, 16, v33
 ; VI-NEXT:    v_add_f16_e32 v15, 0x200, v15
-; VI-NEXT:    v_or_b32_e32 v55, v16, v54
-; VI-NEXT:    v_lshlrev_b32_e32 v54, 16, v60
-; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT:    v_or_b32_e32 v54, v15, v54
-; VI-NEXT:    v_lshrrev_b32_e32 v60, 8, v55
-; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v60, 8, v54
-; VI-NEXT:    v_lshrrev_b64 v[54:55], 24, v[54:55]
-; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v55, 8, v59
-; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v55, 8, v58
+; VI-NEXT:    v_or_b32_e32 v59, v16, v58
+; VI-NEXT:    v_lshlrev_b32_e32 v58, 16, v63
+; VI-NEXT:    v_or_b32_e32 v58, v15, v58
+; VI-NEXT:    v_lshrrev_b32_e32 v62, 8, v59
+; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v62, 8, v58
 ; VI-NEXT:    v_lshrrev_b64 v[58:59], 24, v[58:59]
-; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v60, 8, v57
-; VI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v60, 8, v56
+; VI-NEXT:    v_add_f16_e32 v13, 0x200, v13
+; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v56, v13, v56
+; VI-NEXT:    v_lshrrev_b32_e32 v58, 8, v57
+; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v58, 8, v56
 ; VI-NEXT:    v_lshrrev_b64 v[56:57], 24, v[56:57]
-; VI-NEXT:    v_add_f16_e32 v9, 0x200, v9
-; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT:    v_or_b32_e32 v31, v9, v31
+; VI-NEXT:    v_add_f16_e32 v11, 0x200, v11
+; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v31, v11, v31
 ; VI-NEXT:    v_lshrrev_b32_e32 v56, 8, v32
-; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v56, 8, v31
 ; VI-NEXT:    v_lshrrev_b64 v[31:32], 24, v[31:32]
-; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v55
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v54
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b64 v[31:32], 24, v[54:55]
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v41
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v40
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[31:32], 24, v[40:41]
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v43
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v42
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v46
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v45
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v53
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v52
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b64 v[31:32], 24, v[52:53]
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 8, v51
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 8, v50
-; VI-NEXT:    v_lshrrev_b64 v[50:51], 24, v[50:51]
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v51
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v50
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b64 v[31:32], 24, v[50:51]
 ; VI-NEXT:    v_lshrrev_b64 v[42:43], 24, v[42:43]
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 8, v34
-; VI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshrrev_b64 v[56:57], 24, v[45:46]
-; VI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 8, v33
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b64 v[32:33], 24, v[33:34]
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 8, v38
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 8, v37
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b64 v[32:33], 24, v[37:38]
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; VI-NEXT:    v_mov_b32_e32 v57, v35
-; VI-NEXT:    v_mov_b32_e32 v45, v44
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 8, v34
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 8, v33
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b64 v[32:33], 24, v[33:34]
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshrrev_b64 v[45:46], 24, v[45:46]
+; VI-NEXT:    v_lshrrev_b64 v[52:53], 24, v[52:53]
+; VI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v49
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v48
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b64 v[31:32], 24, v[48:49]
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v38
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v37
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b64 v[31:32], 24, v[37:38]
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v36
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v35
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b64 v[35:36], 24, v[35:36]
+; VI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v54, v39
+; VI-NEXT:    v_mov_b32_e32 v37, v44
+; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    v_mov_b32_e32 v56, v58
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 8, v34
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 8, v33
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b64 v[40:41], 24, v[33:34]
-; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; VI-NEXT:    v_mov_b32_e32 v41, v39
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v49
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v48
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b64 v[49:50], 24, v[48:49]
+; VI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v48, v33
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 8, v34
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 8, v33
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b64 v[52:53], 24, v[33:34]
-; VI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v51
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v50
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b64 v[40:41], 24, v[50:51]
+; VI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 8, v34
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 8, v33
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[33:34]
-; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v51
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v50
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b64 v[31:32], 24, v[50:51]
+; VI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshrrev_b32_e32 v32, 8, v51
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v32, 8, v50
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; VI-NEXT:    v_bfe_u32 v32, v48, 8, 8
-; VI-NEXT:    v_mov_b32_e32 v48, v63
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; VI-NEXT:    v_bfe_u32 v32, v48, 8, 8
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v32, v33, 8, 8
+; VI-NEXT:    v_mov_b32_e32 v33, v47
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v32, v33, 8, 8
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v32, v39, 8, 8
+; VI-NEXT:    v_mov_b32_e32 v39, v63
+; VI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[50:51], 24, v[50:51]
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT:    v_bfe_u32 v32, v41, 8, 8
-; VI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT:    v_bfe_u32 v32, v49, 8, 8
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    v_bfe_u32 v32, v63, 8, 8
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; VI-NEXT:    v_bfe_u32 v32, v44, 8, 8
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; VI-NEXT:    v_bfe_u32 v32, v36, 8, 8
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    v_bfe_u32 v32, v47, 8, 8
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v44, v32
+; VI-NEXT:    v_bfe_u32 v32, v32, 8, 8
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; VI-NEXT:    v_mov_b32_e32 v49, v47
-; VI-NEXT:    v_mov_b32_e32 v44, v36
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_mov_b32_e32 v37, v32
+; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v55, v32
 ; VI-NEXT:    v_bfe_u32 v32, v32, 8, 8
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT:    v_bfe_u32 v32, v47, 8, 8
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; VI-NEXT:    v_bfe_u32 v32, v51, 8, 8
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v32, v36, 8, 8
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_bfe_u32 v32, v32, 8, 8
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_bfe_u32 v32, v32, 8, 8
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; VI-NEXT:    v_bfe_u32 v32, v58, 8, 8
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; VI-NEXT:    v_bfe_u32 v32, v55, 8, 8
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v32, v57, 8, 8
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v32, v59, 8, 8
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; VI-NEXT:    v_bfe_u32 v32, v34, 8, 8
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; VI-NEXT:    v_bfe_u32 v32, v46, 8, 8
-; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v54, v32
-; VI-NEXT:    v_bfe_u32 v59, v32, 8, 8
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v32, v53, 8, 8
+; VI-NEXT:    v_mov_b32_e32 v58, v57
+; VI-NEXT:    v_mov_b32_e32 v57, v59
+; VI-NEXT:    v_mov_b32_e32 v59, v34
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v32, v41, 8, 8
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
 ; VI-NEXT:  .LBB94_4: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b16_e32 v31, 8, v31
+; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshlrev_b16_e32 v32, 8, v32
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b16_e32 v34, 8, v32
+; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshlrev_b16_e32 v32, 8, v32
-; VI-NEXT:    v_or_b32_sdwa v2, v2, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b16_e32 v34, 8, v32
+; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v2, v2, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v34, 8, v52
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v31, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v34, v32, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v56
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v45
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 12, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v42
-; VI-NEXT:    v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 16, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 24, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 28, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 36, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 44, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 48, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 52, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 56, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 60, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v50
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 64, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v59
-; VI-NEXT:    v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x44, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v33
-; VI-NEXT:    v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v31
+; VI-NEXT:    v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x48, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x4c, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v52
-; VI-NEXT:    v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v40
+; VI-NEXT:    v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x50, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x54, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v40
-; VI-NEXT:    v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v49
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x58, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x5c, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v35
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x60, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -193630,10 +193562,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x64, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(2)
@@ -193644,23 +193576,21 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x68, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x6c, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(2)
@@ -193671,8 +193601,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x70, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
@@ -193684,27 +193614,27 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x74, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x78, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x7c, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; VI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
@@ -195987,11 +195917,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(5)
-; SI-NEXT:    v_cvt_f16_f32_e32 v60, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v9
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:80
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32
@@ -196013,489 +195939,459 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:64
 ; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:76
 ; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:72
-; SI-NEXT:    v_mov_b32_e32 v33, v28
-; SI-NEXT:    v_mov_b32_e32 v28, v27
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_cvt_f16_f32_e32 v63, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, v28
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v29, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v11
+; SI-NEXT:    v_mov_b32_e32 v59, v18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v43, v4
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v42, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v63, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v33, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v61, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v31, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v59, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v58, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v56, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v62, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v27, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, v23
+; SI-NEXT:    v_cvt_f16_f32_e32 v58, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v32, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v31, v14
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v15
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v59
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v59, v19
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v22
+; SI-NEXT:    v_cvt_f16_f32_e32 v56, v21
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v24
+; SI-NEXT:    v_cvt_f16_f32_e32 v62, v23
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v33
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v26, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v30
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v20, s19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v16, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, s23
+; SI-NEXT:    v_cvt_f16_f32_e32 v20, s23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v22, s25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v21, s24
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, s27
 ; SI-NEXT:    v_cvt_f16_f32_e32 v19, s26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v24, s29
 ; SI-NEXT:    v_cvt_f16_f32_e32 v23, s28
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v34
 ; SI-NEXT:    v_cvt_f16_f32_e32 v34, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
+; SI-NEXT:    v_cvt_f16_f32_e32 v30, v35
 ; SI-NEXT:    v_cvt_f16_f32_e32 v36, v36
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, v38
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v39
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v49
+; SI-NEXT:    v_cvt_f16_f32_e32 v35, v38
+; SI-NEXT:    v_cvt_f16_f32_e32 v38, v39
+; SI-NEXT:    v_cvt_f16_f32_e32 v39, v48
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v49
 ; SI-NEXT:    v_cvt_f16_f32_e32 v50, v50
 ; SI-NEXT:    v_cvt_f16_f32_e32 v49, v51
 ; SI-NEXT:    s_waitcnt vmcnt(13)
-; SI-NEXT:    v_cvt_f16_f32_e32 v52, v52
+; SI-NEXT:    v_cvt_f16_f32_e32 v48, v52
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(13) expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v53
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v53
+; SI-NEXT:    v_cvt_f16_f32_e32 v54, v54
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_cvt_f16_f32_e32 v51, v54
+; SI-NEXT:    v_cvt_f16_f32_e32 v51, v55
 ; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_cvt_f16_f32_e32 v53, v55
+; SI-NEXT:    v_cvt_f16_f32_e32 v53, v40
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_cvt_f16_f32_e32 v54, v40
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_cvt_f16_f32_e32 v44, v44
+; SI-NEXT:    v_cvt_f16_f32_e32 v40, v44
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, s17
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_cvt_f16_f32_e32 v40, v45
-; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_cvt_f16_f32_e32 v52, v47
+; SI-NEXT:    v_cvt_f16_f32_e32 v44, s19
+; SI-NEXT:    v_cvt_f16_f32_e32 v17, s21
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v37
+; SI-NEXT:    v_cvt_f16_f32_e32 v37, v41
+; SI-NEXT:    v_cvt_f16_f32_e32 v41, v45
 ; SI-NEXT:    v_cvt_f16_f32_e32 v45, v46
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cvt_f16_f32_e32 v57, v57
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v28, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v37
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, s27
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v41
-; SI-NEXT:    v_cvt_f16_f32_e32 v41, v47
-; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v46, v57
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB95_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_or_b32_e32 v47, v13, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v20
+; SI-NEXT:    v_or_b32_e32 v13, v13, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v44
 ; SI-NEXT:    v_or_b32_e32 v55, v12, v11
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v17
-; SI-NEXT:    v_or_b32_e32 v13, v16, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v18
-; SI-NEXT:    v_or_b32_e32 v46, v14, v11
+; SI-NEXT:    v_or_b32_e32 v57, v16, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v20
+; SI-NEXT:    v_or_b32_e32 v17, v14, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v22
-; SI-NEXT:    v_or_b32_e32 v17, v21, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v48
+; SI-NEXT:    v_or_b32_e32 v21, v21, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v15
 ; SI-NEXT:    v_or_b32_e32 v16, v19, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v24
-; SI-NEXT:    v_mov_b32_e32 v24, v32
-; SI-NEXT:    v_or_b32_e32 v22, v23, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v24
-; SI-NEXT:    v_or_b32_e32 v14, v60, v11
+; SI-NEXT:    v_or_b32_e32 v19, v23, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v29
+; SI-NEXT:    v_or_b32_e32 v47, v60, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v43
-; SI-NEXT:    v_mov_b32_e32 v21, v6
-; SI-NEXT:    v_or_b32_e32 v23, v42, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v21
-; SI-NEXT:    v_or_b32_e32 v60, v63, v11
+; SI-NEXT:    s_waitcnt expcnt(4)
+; SI-NEXT:    v_or_b32_e32 v43, v42, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v33
+; SI-NEXT:    v_or_b32_e32 v14, v63, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v61
-; SI-NEXT:    v_or_b32_e32 v43, v3, v11
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v32, v31
+; SI-NEXT:    v_or_b32_e32 v42, v58, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v32
-; SI-NEXT:    v_mov_b32_e32 v31, v10
-; SI-NEXT:    v_mov_b32_e32 v10, v9
-; SI-NEXT:    v_mov_b32_e32 v9, v8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 s[4:5], 0
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_or_b32_e32 v6, v3, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
-; SI-NEXT:    v_or_b32_e32 v12, v59, v11
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v60, v12, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
+; SI-NEXT:    v_or_b32_e32 v22, v2, v11
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v31
-; SI-NEXT:    v_or_b32_e32 v63, v5, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v58
-; SI-NEXT:    v_or_b32_e32 v42, v15, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v52
+; SI-NEXT:    v_or_b32_e32 v12, v46, v12
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v24, v2, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; SI-NEXT:    v_or_b32_e32 v4, v4, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
 ; SI-NEXT:    v_or_b32_e32 v34, v34, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v29
-; SI-NEXT:    v_or_b32_e32 v5, v56, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; SI-NEXT:    v_or_b32_e32 v3, v59, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; SI-NEXT:    v_or_b32_e32 v56, v62, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v27
-; SI-NEXT:    v_or_b32_e32 v3, v30, v11
-; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_or_b32_e32 v59, v56, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v18
+; SI-NEXT:    v_or_b32_e32 v6, v62, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
+; SI-NEXT:    v_or_b32_e32 v62, v25, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v26
+; SI-NEXT:    v_or_b32_e32 v2, v27, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
-; SI-NEXT:    v_or_b32_e32 v58, v25, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v33
-; SI-NEXT:    v_or_b32_e32 v26, v26, v11
-; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT:    v_mov_b32_e32 v33, v32
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v30
 ; SI-NEXT:    v_or_b32_e32 v25, v28, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v35
-; SI-NEXT:    v_or_b32_e32 v29, v36, v11
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v62
-; SI-NEXT:    v_or_b32_e32 v27, v38, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v39
-; SI-NEXT:    v_or_b32_e32 v19, v37, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
-; SI-NEXT:    v_or_b32_e32 v7, v50, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v49
-; SI-NEXT:    v_mov_b32_e32 v36, v5
-; SI-NEXT:    v_or_b32_e32 v5, v52, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
-; SI-NEXT:    v_or_b32_e32 v39, v51, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v53
-; SI-NEXT:    v_mov_b32_e32 v38, v3
-; SI-NEXT:    v_or_b32_e32 v3, v54, v11
-; SI-NEXT:    s_waitcnt expcnt(5)
-; SI-NEXT:    v_mov_b32_e32 v54, v7
-; SI-NEXT:    v_alignbit_b32 v7, v55, v47, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v30
+; SI-NEXT:    v_or_b32_e32 v1, v36, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; SI-NEXT:    v_or_b32_e32 v23, v35, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v38
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v18, v39, v11
+; SI-NEXT:    v_mov_b32_e32 v36, v2
+; SI-NEXT:    v_mov_b32_e32 v35, v1
+; SI-NEXT:    v_alignbit_b32 v1, v55, v13, 24
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v55, v47, 16
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v55, v13, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v55, v47, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v55, v13, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v46, v13, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v17, v57, 24
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v46, v13, 16
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v17, v57, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v46, v13, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v17, v57, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v16, v17, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v16, v21, 24
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v16, v17, 16
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v16, v21, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v16, v17, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v16, v21, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v14, v22, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v47, v19, 24
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v14, v22, 16
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v47, v19, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v14, v22, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v47, v19, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v60, v23, 24
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v14, v43, 24
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v60, v23, 16
-; SI-NEXT:    v_or_b32_e32 v50, v44, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v40
-; SI-NEXT:    v_mov_b32_e32 v40, v43
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v14, v43, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v60, v23, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v14, v43, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v6, v40, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v60, v42, 24
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v6, v40, 16
-; SI-NEXT:    v_mov_b32_e32 v53, v12
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v60, v42, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v6, v40, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v60, v42, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v63, v53, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v24, v22, 24
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v63, v53, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v24, v22, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v34, v42, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v24, v22, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v34, v42, 16
-; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_alignbit_b32 v7, v34, v42, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v34, v4, 24
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v56, v36, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v34, v4, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v56, v36, 16
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v34, v4, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v56, v36, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v59, v3, 24
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v58, v38, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v59, v3, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v58, v38, 16
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v59, v3, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v26, 8, v34
+; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v38
+; SI-NEXT:    v_or_b32_e32 v61, v50, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v49
+; SI-NEXT:    v_or_b32_e32 v2, v48, v11
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v49, v6
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v58, v38, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v62, v49, 24
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v25, v26, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v62, v49, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v25, v26, 16
-; SI-NEXT:    v_or_b32_e32 v15, v45, v11
-; SI-NEXT:    v_mov_b32_e32 v45, v27
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v62, v49, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v25, v26, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v25, v36, 24
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v45, v29, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v25, v36, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v45, v29, 16
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v25, v36, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v45, v29, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v23, v35, 24
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v54, v19, 24
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v41
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v23, v35, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v54, v19, 16
-; SI-NEXT:    v_alignbit_b32 v11, v54, v19, 8
-; SI-NEXT:    v_or_b32_e32 v12, v57, v12
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v23, v35, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v7, v19
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v29, v39, v5, 24
+; SI-NEXT:    v_alignbit_b32 v1, v61, v18, 24
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v11, v39, v5, 16
-; SI-NEXT:    v_mov_b32_e32 v19, v5
-; SI-NEXT:    v_alignbit_b32 v5, v39, v5, 8
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v42, v50, v3, 24
+; SI-NEXT:    v_alignbit_b32 v1, v61, v18, 16
+; SI-NEXT:    s_waitcnt vmcnt(10)
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v48
+; SI-NEXT:    v_or_b32_e32 v58, v54, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v51
+; SI-NEXT:    v_or_b32_e32 v6, v53, v11
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_alignbit_b32 v1, v61, v18, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v5, v50, v3, 16
-; SI-NEXT:    v_mov_b32_e32 v59, v3
-; SI-NEXT:    v_alignbit_b32 v57, v50, v3, 8
-; SI-NEXT:    v_alignbit_b32 v3, v12, v15, 24
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v58, v2, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_alignbit_b32 v1, v58, v2, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 8, v55
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v55
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 8, v46
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v17
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 8, v16
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 8, v14
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v47
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 8, v60
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v14
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 8, v6
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v60
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 8, v63
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v24
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 8, v34
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v59
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 8, v56
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v62
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 8, v58
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v25
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 8, v25
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v23
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v37
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 8, v45
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v61
+; SI-NEXT:    v_or_b32_e32 v54, v40, v11
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 8, v54
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v58
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 8, v39
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v54
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 8, v50
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v12
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 8, v12
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v1, v44, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v3, v20, 8, 8
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v1, v20, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v3, v18, 8, 8
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v1, v15, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v3, v48, 8, 8
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v1, v29, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v3, v24, 8, 8
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v1, v33, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v3, v21, 8, 8
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v1, v32, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v3, v32, 8, 8
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v1, v31, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v3, v31, 8, 8
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v1, v10, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v3, v10, 8, 8
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v1, v9, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v3, v9, 8, 8
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v1, v8, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v3, v8, 8, 8
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v1, v5, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v3, v30, 8, 8
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v1, v38, 8, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v3, v62, 8, 8
-; SI-NEXT:    v_mov_b32_e32 v44, v1
-; SI-NEXT:    v_bfe_u32 v1, v1, 8, 8
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v1, v48, 8, 8
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v41
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v3, v4, 8, 8
+; SI-NEXT:    v_bfe_u32 v1, v37, 8, 8
+; SI-NEXT:    v_or_b32_e32 v11, v45, v11
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v41, 8, 8
-; SI-NEXT:    v_alignbit_b32 v27, v63, v53, 16
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v26, v12, v15, 16
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v11, v15
-; SI-NEXT:    v_alignbit_b32 v43, v12, v15, 8
-; SI-NEXT:    v_mov_b32_e32 v52, v20
-; SI-NEXT:    v_mov_b32_e32 v20, v18
-; SI-NEXT:    v_mov_b32_e32 v18, v48
-; SI-NEXT:    v_mov_b32_e32 v15, v24
-; SI-NEXT:    v_mov_b32_e32 v28, v21
-; SI-NEXT:    v_mov_b32_e32 v21, v23
-; SI-NEXT:    v_mov_b32_e32 v23, v22
-; SI-NEXT:    v_mov_b32_e32 v22, v17
-; SI-NEXT:    v_mov_b32_e32 v17, v13
-; SI-NEXT:    v_mov_b32_e32 v13, v47
+; SI-NEXT:    v_bfe_u32 v1, v52, 8, 8
+; SI-NEXT:    v_alignbit_b32 v28, v58, v2, 24
+; SI-NEXT:    v_alignbit_b32 v2, v54, v6, 24
+; SI-NEXT:    v_alignbit_b32 v39, v54, v6, 16
+; SI-NEXT:    v_alignbit_b32 v40, v54, v6, 8
+; SI-NEXT:    v_alignbit_b32 v27, v12, v11, 24
+; SI-NEXT:    v_alignbit_b32 v56, v12, v11, 16
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v30, v12, v11, 8
+; SI-NEXT:    v_mov_b32_e32 v20, v29
+; SI-NEXT:    v_mov_b32_e32 v15, v33
+; SI-NEXT:    v_mov_b32_e32 v33, v32
 ; SI-NEXT:    v_mov_b32_e32 v32, v31
 ; SI-NEXT:    v_mov_b32_e32 v31, v10
 ; SI-NEXT:    v_mov_b32_e32 v10, v9
-; SI-NEXT:    v_mov_b32_e32 v51, v8
-; SI-NEXT:    v_mov_b32_e32 v48, v30
-; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v3, v2
-; SI-NEXT:    v_bfe_u32 v30, v2, 8, 8
+; SI-NEXT:    v_mov_b32_e32 v9, v7
+; SI-NEXT:    v_bfe_u32 v29, v7, 8, 8
+; SI-NEXT:    v_mov_b32_e32 v7, v8
+; SI-NEXT:    v_mov_b32_e32 v8, v5
+; SI-NEXT:    v_mov_b32_e32 v44, v37
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT:    v_mov_b32_e32 v37, v62
 ; SI-NEXT:    s_branch .LBB95_3
 ; SI-NEXT:  .LBB95_2:
-; SI-NEXT:    v_mov_b32_e32 v52, v20
-; SI-NEXT:    v_mov_b32_e32 v20, v18
-; SI-NEXT:    v_mov_b32_e32 v18, v48
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(5)
-; SI-NEXT:    v_mov_b32_e32 v44, v1
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
@@ -196641,390 +196537,391 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    v_mov_b32_e32 v15, v32
-; SI-NEXT:    v_mov_b32_e32 v28, v6
-; SI-NEXT:    v_mov_b32_e32 v33, v31
-; SI-NEXT:    v_mov_b32_e32 v32, v10
-; SI-NEXT:    v_mov_b32_e32 v31, v9
-; SI-NEXT:    v_mov_b32_e32 v10, v8
-; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    v_mov_b32_e32 v3, v2
+; SI-NEXT:    v_mov_b32_e32 v20, v29
+; SI-NEXT:    v_mov_b32_e32 v15, v33
+; SI-NEXT:    v_mov_b32_e32 v33, v32
+; SI-NEXT:    v_mov_b32_e32 v32, v31
+; SI-NEXT:    v_mov_b32_e32 v31, v10
+; SI-NEXT:    v_mov_b32_e32 v10, v9
+; SI-NEXT:    v_mov_b32_e32 v9, v7
+; SI-NEXT:    v_mov_b32_e32 v7, v8
+; SI-NEXT:    v_mov_b32_e32 v8, v5
+; SI-NEXT:    v_mov_b32_e32 v44, v37
 ; SI-NEXT:    s_mov_b64 s[4:5], -1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr13
 ; SI-NEXT:    ; implicit-def: $vgpr55
+; SI-NEXT:    ; implicit-def: $vgpr57
 ; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr46
-; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr21
 ; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr23
+; SI-NEXT:    ; implicit-def: $vgpr19
+; SI-NEXT:    ; implicit-def: $vgpr47
+; SI-NEXT:    ; implicit-def: $vgpr43
 ; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr21
+; SI-NEXT:    ; implicit-def: $vgpr42
 ; SI-NEXT:    ; implicit-def: $vgpr60
-; SI-NEXT:    ; implicit-def: $vgpr40
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr53
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr63
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr24
+; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr34
+; SI-NEXT:    ; implicit-def: $vgpr26
+; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr59
+; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    ; implicit-def: $vgpr62
+; SI-NEXT:    ; implicit-def: $vgpr29
 ; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr56
-; SI-NEXT:    ; implicit-def: $vgpr38
-; SI-NEXT:    ; implicit-def: $vgpr58
 ; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr54
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr29
+; SI-NEXT:    ; implicit-def: $vgpr35
+; SI-NEXT:    ; implicit-def: $vgpr23
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr28
+; SI-NEXT:    ; implicit-def: $vgpr58
+; SI-NEXT:    ; implicit-def: $vgpr40
 ; SI-NEXT:    ; implicit-def: $vgpr39
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr54
 ; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr57
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr42
-; SI-NEXT:    ; implicit-def: $vgpr50
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr43
-; SI-NEXT:    ; implicit-def: $vgpr26
+; SI-NEXT:    ; implicit-def: $vgpr56
+; SI-NEXT:    ; implicit-def: $vgpr27
 ; SI-NEXT:    ; implicit-def: $vgpr12
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:  .LBB95_3: ; %Flow
+; SI-NEXT:    s_waitcnt expcnt(5)
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(3)
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, v44
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_mov_b32_e32 v47, v48
-; SI-NEXT:    v_mov_b32_e32 v62, v51
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v2, v3
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v5, v8
+; SI-NEXT:    v_mov_b32_e32 v6, v7
+; SI-NEXT:    v_mov_b32_e32 v7, v9
+; SI-NEXT:    v_mov_b32_e32 v8, v10
 ; SI-NEXT:    v_mov_b32_e32 v9, v31
 ; SI-NEXT:    v_mov_b32_e32 v31, v33
-; SI-NEXT:    v_mov_b32_e32 v33, v15
-; SI-NEXT:    v_mov_b32_e32 v15, v18
-; SI-NEXT:    v_mov_b32_e32 v18, v20
-; SI-NEXT:    v_mov_b32_e32 v20, v52
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v3, v41
-; SI-NEXT:    v_mov_b32_e32 v35, v30
-; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v44, v15
+; SI-NEXT:    v_mov_b32_e32 v33, v20
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v50, v2
+; SI-NEXT:    v_mov_b32_e32 v53, v40
+; SI-NEXT:    v_mov_b32_e32 v40, v28
+; SI-NEXT:    s_waitcnt vmcnt(10)
+; SI-NEXT:    v_mov_b32_e32 v2, v48
+; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
 ; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; SI-NEXT:    v_mov_b32_e32 v4, v8
-; SI-NEXT:    v_mov_b32_e32 v8, v10
-; SI-NEXT:    v_mov_b32_e32 v10, v32
-; SI-NEXT:    v_mov_b32_e32 v32, v28
-; SI-NEXT:    v_mov_b32_e32 v28, v43
-; SI-NEXT:    v_mov_b32_e32 v43, v42
-; SI-NEXT:    v_mov_b32_e32 v42, v5
-; SI-NEXT:    v_mov_b32_e32 v24, v29
-; SI-NEXT:    v_mov_b32_e32 v29, v1
-; SI-NEXT:    v_mov_b32_e32 v1, v19
-; SI-NEXT:    v_mov_b32_e32 v19, v7
-; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_mov_b32_e32 v5, v37
+; SI-NEXT:    v_mov_b32_e32 v11, v27
+; SI-NEXT:    v_mov_b32_e32 v38, v30
+; SI-NEXT:    v_mov_b32_e32 v27, v52
+; SI-NEXT:    v_mov_b32_e32 v30, v29
+; SI-NEXT:    v_mov_b32_e32 v29, v26
 ; SI-NEXT:    s_cbranch_vccnz .LBB95_5
 ; SI-NEXT:  ; %bb.4: ; %cmp.true
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v29
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v47
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v4
 ; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
 ; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v47, v6
-; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v51
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_cvt_f32_f16_e32 v35, v37
 ; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v49
+; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v44
+; SI-NEXT:    v_cvt_f32_f16_e32 v38, v41
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v32
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, v33
-; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v39
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v39
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; SI-NEXT:    v_add_f32_e32 v38, 0x38000000, v38
+; SI-NEXT:    v_cvt_f16_f32_e32 v38, v38
 ; SI-NEXT:    v_add_f32_e32 v31, 0x38000000, v31
 ; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT:    v_add_f32_e32 v32, 0x38000000, v32
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
 ; SI-NEXT:    v_add_f32_e32 v33, 0x38000000, v33
 ; SI-NEXT:    v_cvt_f16_f32_e32 v33, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
 ; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
 ; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_cvt_f32_f16_e32 v38, v38
 ; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, v49
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cvt_f32_f16_e32 v51, v51
-; SI-NEXT:    v_add_f32_e32 v38, 0x38000000, v38
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, v38
-; SI-NEXT:    v_add_f32_e32 v51, 0x38000000, v51
-; SI-NEXT:    v_cvt_f16_f32_e32 v51, v51
+; SI-NEXT:    v_cvt_f32_f16_e32 v39, v39
+; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v39
+; SI-NEXT:    v_cvt_f16_f32_e32 v39, v39
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v50, v50
 ; SI-NEXT:    v_add_f32_e32 v50, 0x38000000, v50
 ; SI-NEXT:    v_cvt_f16_f32_e32 v50, v50
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v27
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v34, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT:    v_or_b32_e32 v11, v12, v11
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v34
+; SI-NEXT:    v_or_b32_e32 v56, v12, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v4
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
 ; SI-NEXT:    v_or_b32_e32 v12, v12, v13
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_or_b32_e32 v59, v14, v13
+; SI-NEXT:    v_or_b32_e32 v36, v14, v13
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_or_b32_e32 v55, v14, v16
+; SI-NEXT:    v_or_b32_e32 v54, v14, v16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v16, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
 ; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_or_b32_e32 v1, v17, v16
+; SI-NEXT:    v_or_b32_e32 v52, v17, v16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT:    v_or_b32_e32 v17, v17, v19
+; SI-NEXT:    v_or_b32_e32 v58, v17, v19
+; SI-NEXT:    v_alignbit_b32 v40, v58, v52, 24
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v19, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v19, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
 ; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
 ; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
-; SI-NEXT:    v_or_b32_e32 v19, v21, v19
+; SI-NEXT:    v_or_b32_e32 v11, v21, v19
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v4
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v13
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
 ; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
-; SI-NEXT:    v_or_b32_e32 v54, v21, v22
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v27, v4
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v61, v21, v22
+; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
 ; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v23, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT:    v_or_b32_e32 v36, v23, v22
+; SI-NEXT:    v_or_b32_e32 v16, v23, v22
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v23, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT:    v_or_b32_e32 v45, v23, v25
+; SI-NEXT:    v_or_b32_e32 v23, v23, v25
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v24, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v24, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
 ; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
 ; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v26, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; SI-NEXT:    v_or_b32_e32 v13, v25, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v47
+; SI-NEXT:    v_or_b32_e32 v48, v25, v24
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v6
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v26, v3
 ; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
 ; SI-NEXT:    v_or_b32_e32 v25, v26, v25
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v27, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
-; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v62
+; SI-NEXT:    v_cvt_f32_f16_e32 v26, v4
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
-; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; SI-NEXT:    v_or_b32_e32 v14, v26, v27
+; SI-NEXT:    v_or_b32_e32 v53, v26, v27
 ; SI-NEXT:    v_mov_b32_e32 v26, v13
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v16
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v7
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v28, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v4
 ; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
 ; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
-; SI-NEXT:    v_or_b32_e32 v58, v28, v27
+; SI-NEXT:    v_or_b32_e32 v62, v28, v27
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v29, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v29, v13
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v27, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v8
 ; SI-NEXT:    v_add_f32_e32 v29, 0x38000000, v29
 ; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
 ; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v29, v13
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v29, v63
 ; SI-NEXT:    v_add_f32_e32 v29, 0x38000000, v29
 ; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
-; SI-NEXT:    v_or_b32_e32 v56, v29, v34
+; SI-NEXT:    v_or_b32_e32 v59, v29, v34
 ; SI-NEXT:    v_add_f32_e32 v34, 0x38000000, v35
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v29, v36
+; SI-NEXT:    v_cvt_f32_f16_e32 v36, v51
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f16_f32_e32 v34, v34
-; SI-NEXT:    v_mov_b32_e32 v29, v16
-; SI-NEXT:    v_mov_b32_e32 v62, v29
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v30, v7
+; SI-NEXT:    v_add_f32_e32 v36, 0x38000000, v36
+; SI-NEXT:    v_cvt_f16_f32_e32 v36, v36
+; SI-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_cvt_f32_f16_e32 v30, v13
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v30, 0x38000000, v30
 ; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; SI-NEXT:    v_or_b32_e32 v7, v30, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v30, v52
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v13
-; SI-NEXT:    v_mov_b32_e32 v28, v36
-; SI-NEXT:    v_cvt_f32_f16_e32 v36, v48
+; SI-NEXT:    v_or_b32_e32 v3, v30, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v30, v10
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v32
+; SI-NEXT:    v_cvt_f32_f16_e32 v32, v44
 ; SI-NEXT:    v_add_f32_e32 v30, 0x38000000, v30
 ; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v35
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT:    v_add_f32_e32 v36, 0x38000000, v36
+; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; SI-NEXT:    v_add_f32_e32 v32, 0x38000000, v32
 ; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; SI-NEXT:    v_or_b32_e32 v41, v34, v30
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_cvt_f32_f16_e32 v35, v35
+; SI-NEXT:    v_or_b32_e32 v4, v34, v30
 ; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v44, v32
+; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v35
+; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v51, v51
+; SI-NEXT:    v_mov_b32_e32 v30, v10
+; SI-NEXT:    v_mov_b32_e32 v32, v30
 ; SI-NEXT:    v_or_b32_e32 v34, v35, v34
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v61
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v36
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v35, v46
+; SI-NEXT:    v_add_f32_e32 v51, 0x38000000, v51
+; SI-NEXT:    v_cvt_f16_f32_e32 v51, v51
 ; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v35
 ; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
-; SI-NEXT:    v_or_b32_e32 v53, v35, v36
+; SI-NEXT:    v_or_b32_e32 v22, v35, v36
 ; SI-NEXT:    v_lshlrev_b32_e32 v36, 16, v10
-; SI-NEXT:    v_or_b32_e32 v63, v37, v36
-; SI-NEXT:    v_mov_b32_e32 v36, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v38
-; SI-NEXT:    v_or_b32_e32 v40, v39, v37
-; SI-NEXT:    v_mov_b32_e32 v38, v14
-; SI-NEXT:    v_alignbit_b32 v27, v63, v53, 16
-; SI-NEXT:    v_alignbit_b32 v30, v63, v53, 8
-; SI-NEXT:    v_bfe_u32 v35, v2, 8, 8
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v35, v16
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v37, v13
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
 ; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v7
+; SI-NEXT:    v_or_b32_e32 v24, v37, v36
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v38
+; SI-NEXT:    v_or_b32_e32 v42, v39, v37
+; SI-NEXT:    v_cvt_f32_f16_e32 v39, v45
+; SI-NEXT:    v_mov_b32_e32 v36, v48
 ; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v39
 ; SI-NEXT:    v_cvt_f16_f32_e32 v48, v39
 ; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v31
-; SI-NEXT:    v_or_b32_e32 v6, v37, v39
-; SI-NEXT:    v_mov_b32_e32 v39, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v48
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v49
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT:    v_alignbit_b32 v24, v39, v1, 24
-; SI-NEXT:    v_or_b32_e32 v21, v48, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v32
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v37, v13
 ; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
 ; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v37
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v60, v37, v39
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v48
+; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v49
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
+; SI-NEXT:    v_alignbit_b32 v39, v54, v29, 16
+; SI-NEXT:    v_or_b32_e32 v43, v48, v37
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v44
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, v49
 ; SI-NEXT:    v_add_f32_e32 v49, 0x38000000, v49
 ; SI-NEXT:    v_cvt_f16_f32_e32 v49, v49
-; SI-NEXT:    v_or_b32_e32 v60, v49, v48
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v37, v37
+; SI-NEXT:    v_or_b32_e32 v14, v49, v48
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
+; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v28, v14, v43, 8
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v37
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v48, v48
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -197033,20 +196930,20 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
 ; SI-NEXT:    v_add_f32_e32 v49, 0x38000000, v49
 ; SI-NEXT:    v_cvt_f16_f32_e32 v49, v49
-; SI-NEXT:    v_or_b32_e32 v23, v48, v37
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v19, v48, v37
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v33
-; SI-NEXT:    v_or_b32_e32 v14, v49, v37
+; SI-NEXT:    v_or_b32_e32 v47, v49, v37
 ; SI-NEXT:    v_lshlrev_b32_e32 v49, 16, v15
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v48, v48
 ; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v48
 ; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
 ; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v48
-; SI-NEXT:    v_or_b32_e32 v22, v50, v37
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v21, v50, v37
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v37, v37
 ; SI-NEXT:    s_waitcnt vmcnt(1)
@@ -197056,27 +196953,26 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v48
 ; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
 ; SI-NEXT:    v_or_b32_e32 v16, v37, v49
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v50, v50
 ; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v48
 ; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v50
 ; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT:    v_mov_b32_e32 v50, v55
-; SI-NEXT:    v_alignbit_b32 v43, v50, v59, 24
-; SI-NEXT:    v_alignbit_b32 v42, v50, v59, 16
-; SI-NEXT:    v_or_b32_e32 v17, v48, v37
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v50, v54, v29, 24
+; SI-NEXT:    v_or_b32_e32 v57, v48, v37
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v18
-; SI-NEXT:    v_alignbit_b32 v57, v50, v59, 8
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, v49
 ; SI-NEXT:    v_add_f32_e32 v49, 0x38000000, v49
 ; SI-NEXT:    v_cvt_f16_f32_e32 v49, v49
-; SI-NEXT:    v_or_b32_e32 v46, v49, v48
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v17, v49, v48
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v37, v37
+; SI-NEXT:    v_mov_b32_e32 v49, v53
+; SI-NEXT:    v_alignbit_b32 v53, v54, v29, 8
 ; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
 ; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
 ; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v37
@@ -197087,575 +196983,573 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_or_b32_e32 v13, v48, v37
 ; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v20
 ; SI-NEXT:    v_or_b32_e32 v55, v51, v37
-; SI-NEXT:    v_alignbit_b32 v7, v55, v13, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v55, v13, 16
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v55, v13, 16
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v55, v13, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v55, v13, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v17, v57, 24
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v46, v17, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v17, v57, 16
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v46, v17, 16
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v17, v57, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v46, v17, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v16, v21, 24
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v16, v22, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v16, v21, 16
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v16, v22, 16
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v16, v21, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v16, v22, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v47, v19, 24
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v14, v23, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v47, v19, 16
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v14, v23, 16
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v47, v19, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v14, v23, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v14, v43, 24
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v60, v21, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v14, v43, 16
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v60, v21, 16
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v60, v42, 24
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v60, v21, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v60, v42, 16
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v6, v40, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v60, v42, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v6, v40, 16
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v24, v22, 24
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v6, v40, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v24, v22, 16
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v63, v53, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v24, v22, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v34, v41, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v34, v4, 24
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v34, v41, 16
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v34, v4, 16
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v34, v41, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v34, v4, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v56, v36, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v59, v3, 24
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v56, v36, 16
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v59, v3, 16
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v56, v36, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v59, v3, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v58, v38, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v62, v49, 24
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v58, v38, 16
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v62, v49, 16
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v58, v38, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v62, v49, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v25, v26, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v25, v36, 24
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v25, v26, 16
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_alignbit_b32 v7, v25, v26, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v45, v28, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v45, v28, 16
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_alignbit_b32 v7, v45, v28, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v25, v36, 16
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v54, v19, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v25, v36, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v54, v19, 16
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v23, v35, 24
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v54, v19, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v23, v35, 16
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v39, v1, 16
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v23, v35, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v7, v39, v1, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v61, v11, 24
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 8, v55
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v61, v11, 16
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_alignbit_b32 v10, v61, v11, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 8, v46
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v10, v58, v52, 16
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_alignbit_b32 v10, v58, v52, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 8, v16
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v10, v56
+; SI-NEXT:    v_alignbit_b32 v11, v12, v10, 24
+; SI-NEXT:    v_alignbit_b32 v56, v12, v10, 16
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v38, v12, v10, 8
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 8, v14
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v55
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 8, v60
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v17
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 8, v6
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v16
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 8, v63
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v47
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 8, v34
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v14
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 8, v56
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v60
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 8, v58
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v24
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 8, v25
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v59
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 8, v45
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v62
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 8, v54
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v25
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 8, v39
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v23
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 8, v50
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v61
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 8, v12
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v58
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v7, v20, 8, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v54
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v7, v18, 8, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v12
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v7, v15, 8, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v10, v20, 8, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v7, v33, 8, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v10, v18, 8, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v7, v32, 8, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v10, v15, 8, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v7, v31, 8, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v10, v33, 8, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v7, v10, 8, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v10, v44, 8, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v7, v9, 8, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v10, v31, 8, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v7, v8, 8, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v10, v30, 8, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v7, v29, 8, 8
-; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v10, v9, 8, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v7, v47, 8, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v10, v8, 8, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v7, v5, 8, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v10, v6, 8, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v7, v4, 8, 8
-; SI-NEXT:    v_alignbit_b32 v26, v12, v11, 24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v10, v5, 8, 8
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_bfe_u32 v10, v26, 8, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v26, v12, v11, 16
-; SI-NEXT:    v_alignbit_b32 v28, v12, v11, 8
-; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_bfe_u32 v7, v29, 8, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v10, v2, 8, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v7, v3, 8, 8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v10, v1, 8, 8
+; SI-NEXT:    v_alignbit_b32 v48, v55, v13, 24
+; SI-NEXT:    v_lshrrev_b32_e32 v29, 8, v34
+; SI-NEXT:    v_bfe_u32 v30, v7, 8, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_bfe_u32 v10, v27, 8, 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; SI-NEXT:  .LBB95_5: ; %end
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_and_b32_e32 v37, 0xff, v13
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v52, 24, v48
 ; SI-NEXT:    v_and_b32_e32 v20, 0xff, v20
 ; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
 ; SI-NEXT:    v_and_b32_e32 v18, 0xff, v18
 ; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
 ; SI-NEXT:    v_and_b32_e32 v15, 0xff, v15
 ; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v51, 8, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v51, 8, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
 ; SI-NEXT:    v_or_b32_e32 v37, v37, v51
 ; SI-NEXT:    v_and_b32_e32 v37, 0xffff, v37
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v51, 0xff, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v51, 0xff, v10
 ; SI-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v52, 24, v7
 ; SI-NEXT:    v_or_b32_e32 v51, v52, v51
 ; SI-NEXT:    v_or_b32_e32 v37, v37, v51
 ; SI-NEXT:    buffer_store_dword v37, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v37, 0xff, v55
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v51, 8, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v51, 8, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
 ; SI-NEXT:    v_or_b32_e32 v37, v37, v51
 ; SI-NEXT:    v_and_b32_e32 v37, 0xffff, v37
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 24, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v48, 24, v10
 ; SI-NEXT:    v_or_b32_e32 v20, v48, v20
 ; SI-NEXT:    v_or_b32_e32 v20, v37, v20
 ; SI-NEXT:    v_add_i32_e32 v37, vcc, 4, v0
 ; SI-NEXT:    buffer_store_dword v20, v37, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v17
+; SI-NEXT:    v_and_b32_e32 v20, 0xff, v57
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 8, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 8, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
 ; SI-NEXT:    v_or_b32_e32 v20, v20, v37
 ; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v20
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v37, 0xff, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v37, 0xff, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v37
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 24, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v48, 24, v10
 ; SI-NEXT:    v_or_b32_e32 v37, v48, v37
 ; SI-NEXT:    v_or_b32_e32 v20, v20, v37
 ; SI-NEXT:    v_add_i32_e32 v37, vcc, 8, v0
 ; SI-NEXT:    buffer_store_dword v20, v37, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v46
+; SI-NEXT:    v_and_b32_e32 v20, 0xff, v17
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 8, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 8, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
 ; SI-NEXT:    v_or_b32_e32 v20, v20, v37
 ; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v20
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 24, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 24, v10
 ; SI-NEXT:    v_or_b32_e32 v18, v37, v18
 ; SI-NEXT:    v_or_b32_e32 v18, v20, v18
 ; SI-NEXT:    v_add_i32_e32 v20, vcc, 12, v0
 ; SI-NEXT:    buffer_store_dword v18, v20, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xff, v22
+; SI-NEXT:    v_and_b32_e32 v18, 0xff, v21
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 8, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 8, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
 ; SI-NEXT:    v_or_b32_e32 v18, v18, v20
 ; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v18
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v20, 0xff, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 24, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 24, v10
 ; SI-NEXT:    v_or_b32_e32 v20, v37, v20
 ; SI-NEXT:    v_or_b32_e32 v18, v18, v20
 ; SI-NEXT:    v_add_i32_e32 v20, vcc, 16, v0
 ; SI-NEXT:    buffer_store_dword v18, v20, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v18, 0xff, v16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 8, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 8, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
 ; SI-NEXT:    v_or_b32_e32 v18, v18, v20
 ; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v18
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v10
 ; SI-NEXT:    v_or_b32_e32 v15, v20, v15
 ; SI-NEXT:    v_or_b32_e32 v15, v18, v15
 ; SI-NEXT:    v_add_i32_e32 v18, vcc, 20, v0
 ; SI-NEXT:    buffer_store_dword v15, v18, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v23
+; SI-NEXT:    v_and_b32_e32 v15, 0xff, v19
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v18
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xff, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v18, 0xff, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v10
 ; SI-NEXT:    v_or_b32_e32 v18, v20, v18
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v18
 ; SI-NEXT:    v_add_i32_e32 v18, vcc, 24, v0
 ; SI-NEXT:    buffer_store_dword v15, v18, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v14
+; SI-NEXT:    v_and_b32_e32 v15, 0xff, v47
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v18
 ; SI-NEXT:    v_and_b32_e32 v18, 0xff, v33
 ; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v10
 ; SI-NEXT:    v_or_b32_e32 v18, v20, v18
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v18
 ; SI-NEXT:    v_add_i32_e32 v18, vcc, 28, v0
 ; SI-NEXT:    buffer_store_dword v15, v18, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v21
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v15, 0xff, v43
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v28
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v18
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xff, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v18, 0xff, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v10
 ; SI-NEXT:    v_or_b32_e32 v18, v20, v18
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v18
 ; SI-NEXT:    v_add_i32_e32 v18, vcc, 32, v0
 ; SI-NEXT:    buffer_store_dword v15, v18, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v60
+; SI-NEXT:    v_and_b32_e32 v15, 0xff, v14
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v18
-; SI-NEXT:    v_and_b32_e32 v18, 0xff, v32
+; SI-NEXT:    v_and_b32_e32 v18, 0xff, v44
 ; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v10
 ; SI-NEXT:    v_or_b32_e32 v18, v20, v18
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v18
 ; SI-NEXT:    v_add_i32_e32 v18, vcc, 36, v0
 ; SI-NEXT:    buffer_store_dword v15, v18, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v40
+; SI-NEXT:    v_and_b32_e32 v15, 0xff, v42
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v18
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xff, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v18, 0xff, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v10
 ; SI-NEXT:    v_or_b32_e32 v18, v20, v18
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v18
 ; SI-NEXT:    v_add_i32_e32 v18, vcc, 40, v0
 ; SI-NEXT:    buffer_store_dword v15, v18, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v6
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v47
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_and_b32_e32 v15, 0xff, v60
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v18
 ; SI-NEXT:    v_and_b32_e32 v18, 0xff, v31
 ; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v10
 ; SI-NEXT:    v_or_b32_e32 v18, v20, v18
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v18
 ; SI-NEXT:    v_add_i32_e32 v18, vcc, 44, v0
 ; SI-NEXT:    buffer_store_dword v15, v18, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v53
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v30
+; SI-NEXT:    v_and_b32_e32 v15, 0xff, v22
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v18
-; SI-NEXT:    v_and_b32_e32 v18, 0xff, v27
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v7
+; SI-NEXT:    v_and_b32_e32 v18, 0xff, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v10
 ; SI-NEXT:    v_or_b32_e32 v18, v20, v18
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v18
 ; SI-NEXT:    v_add_i32_e32 v18, vcc, 48, v0
 ; SI-NEXT:    buffer_store_dword v15, v18, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v63
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v15, 0xff, v24
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v10
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v32
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v18
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 24, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 24, v13
 ; SI-NEXT:    v_or_b32_e32 v10, v18, v10
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
 ; SI-NEXT:    v_or_b32_e32 v10, v15, v10
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v10, v15, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v41
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v4
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 8, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 8, v13
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
 ; SI-NEXT:    v_or_b32_e32 v10, v10, v15
 ; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v15, 0xff, v13
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 24, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 24, v13
 ; SI-NEXT:    v_or_b32_e32 v15, v18, v15
 ; SI-NEXT:    v_or_b32_e32 v10, v10, v15
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 56, v0
 ; SI-NEXT:    buffer_store_dword v10, v15, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v10, 0xff, v34
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 8, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 8, v29
 ; SI-NEXT:    v_or_b32_e32 v10, v10, v15
 ; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 24, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 24, v13
 ; SI-NEXT:    v_or_b32_e32 v9, v15, v9
 ; SI-NEXT:    v_or_b32_e32 v9, v10, v9
 ; SI-NEXT:    v_add_i32_e32 v10, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v9, 0xff, v36
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 8, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v3
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
 ; SI-NEXT:    v_or_b32_e32 v9, v9, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 24, v13
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v10
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 24, v7
 ; SI-NEXT:    v_or_b32_e32 v10, v15, v10
 ; SI-NEXT:    v_or_b32_e32 v9, v9, v10
 ; SI-NEXT:    v_add_i32_e32 v10, vcc, 64, v0
 ; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v9, 0xff, v56
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v59
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 8, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 8, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
 ; SI-NEXT:    v_or_b32_e32 v9, v9, v10
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v3
 ; SI-NEXT:    v_or_b32_e32 v8, v10, v8
 ; SI-NEXT:    v_or_b32_e32 v8, v9, v8
 ; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x44, v0
 ; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v38
+; SI-NEXT:    v_and_b32_e32 v8, 0xff, v49
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v4
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
 ; SI-NEXT:    v_or_b32_e32 v8, v8, v9
 ; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v9, 0xff, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v4
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v4
 ; SI-NEXT:    v_or_b32_e32 v9, v10, v9
 ; SI-NEXT:    v_or_b32_e32 v8, v8, v9
 ; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x48, v0
 ; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v58
+; SI-NEXT:    v_and_b32_e32 v8, 0xff, v62
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v4
 ; SI-NEXT:    v_or_b32_e32 v8, v8, v9
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v62
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 24, v30
 ; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
 ; SI-NEXT:    v_or_b32_e32 v7, v9, v7
 ; SI-NEXT:    v_or_b32_e32 v7, v8, v7
 ; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x4c, v0
 ; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v7, 0xff, v36
 ; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 8, v4
 ; SI-NEXT:    v_or_b32_e32 v7, v7, v8
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
@@ -197668,26 +197562,24 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_or_b32_e32 v7, v7, v8
 ; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x50, v0
 ; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v7, 0xff, v25
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 8, v4
 ; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 24, v3
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
 ; SI-NEXT:    v_or_b32_e32 v6, v8, v6
 ; SI-NEXT:    v_or_b32_e32 v6, v7, v6
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x54, v0
 ; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v35
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
 ; SI-NEXT:    v_or_b32_e32 v6, v6, v7
@@ -197702,24 +197594,25 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_or_b32_e32 v6, v6, v7
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x58, v0
 ; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v45
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v23
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v4
 ; SI-NEXT:    v_or_b32_e32 v6, v6, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v3
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; SI-NEXT:    v_or_b32_e32 v5, v7, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x5c, v0
 ; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v19
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(2) expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v3
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v6
@@ -197734,13 +197627,17 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v6
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x60, v0
 ; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v54
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v61
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v4
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
@@ -197748,12 +197645,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_or_b32_e32 v4, v5, v4
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x64, v0
 ; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v24
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v40
+; SI-NEXT:    s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v5
@@ -197768,23 +197664,27 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
 ; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v39
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v58
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v35
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
 ; SI-NEXT:    v_or_b32_e32 v2, v5, v2
 ; SI-NEXT:    v_or_b32_e32 v2, v4, v2
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x6c, v0
 ; SI-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v59
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v57
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v53
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v50
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v4
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v42
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v39
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v43
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    v_or_b32_e32 v4, v5, v4
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v4
@@ -197792,7 +197692,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v50
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v54
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v4
@@ -197804,33 +197704,33 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x74, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v28
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v38
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v11
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v26
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v56
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; SI-NEXT:    v_or_b32_e32 v2, v4, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v12
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v27
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
@@ -197857,8 +197757,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
 ; VI-NEXT:    v_writelane_b32 v63, s30, 0
 ; VI-NEXT:    v_writelane_b32 v63, s31, 1
@@ -198088,117 +197988,112 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    s_lshr_b32 s46, s45, 16
 ; VI-NEXT:    v_mov_b32_e32 v7, 0x200
 ; VI-NEXT:    v_add_f16_e32 v1, s46, v7
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_add_f16_e32 v2, s45, v7
 ; VI-NEXT:    s_lshr_b32 s45, s44, 16
 ; VI-NEXT:    v_or_b32_e32 v23, v2, v1
 ; VI-NEXT:    v_add_f16_e32 v1, s45, v7
-; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_add_f16_e32 v2, s44, v7
 ; VI-NEXT:    s_lshr_b32 s44, s43, 16
 ; VI-NEXT:    v_or_b32_e32 v22, v2, v1
 ; VI-NEXT:    v_add_f16_e32 v1, s44, v7
-; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_add_f16_e32 v2, s43, v7
 ; VI-NEXT:    s_lshr_b32 s43, s42, 16
 ; VI-NEXT:    v_or_b32_e32 v25, v2, v1
 ; VI-NEXT:    v_add_f16_e32 v1, s43, v7
-; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_add_f16_e32 v2, s42, v7
 ; VI-NEXT:    s_lshr_b32 s42, s41, 16
 ; VI-NEXT:    v_or_b32_e32 v24, v2, v1
 ; VI-NEXT:    v_add_f16_e32 v1, s42, v7
-; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_add_f16_e32 v2, s41, v7
 ; VI-NEXT:    s_lshr_b32 s41, s40, 16
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; VI-NEXT:    v_or_b32_e32 v27, v2, v1
 ; VI-NEXT:    v_add_f16_e32 v1, s41, v7
-; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_add_f16_e32 v2, s40, v7
 ; VI-NEXT:    s_lshr_b32 s40, s15, 16
-; VI-NEXT:    v_or_b32_e32 v26, v2, v1
-; VI-NEXT:    v_add_f16_e32 v1, s40, v7
-; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT:    v_add_f16_e32 v53, s40, v7
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v26, v2, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v53
 ; VI-NEXT:    v_add_f16_e32 v2, s15, v7
 ; VI-NEXT:    s_lshr_b32 s15, s14, 16
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; VI-NEXT:    v_or_b32_e32 v29, v2, v1
 ; VI-NEXT:    v_add_f16_e32 v1, s15, v7
-; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_add_f16_e32 v2, s14, v7
 ; VI-NEXT:    s_lshr_b32 s14, s13, 16
-; VI-NEXT:    v_or_b32_e32 v28, v2, v1
-; VI-NEXT:    v_add_f16_e32 v1, s14, v7
-; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT:    v_add_f16_e32 v43, s14, v7
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v28, v2, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v43
 ; VI-NEXT:    v_add_f16_e32 v2, s13, v7
 ; VI-NEXT:    s_lshr_b32 s13, s12, 16
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; VI-NEXT:    v_or_b32_e32 v6, v2, v1
 ; VI-NEXT:    v_add_f16_e32 v1, s13, v7
-; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_add_f16_e32 v2, s12, v7
 ; VI-NEXT:    s_lshr_b32 s12, s11, 16
-; VI-NEXT:    v_or_b32_e32 v5, v2, v1
-; VI-NEXT:    v_add_f16_e32 v1, s12, v7
-; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT:    v_add_f16_e32 v37, s12, v7
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v5, v2, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v37
 ; VI-NEXT:    v_add_f16_e32 v2, s11, v7
 ; VI-NEXT:    s_lshr_b32 s11, s10, 16
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; VI-NEXT:    v_or_b32_e32 v31, v2, v1
 ; VI-NEXT:    v_add_f16_e32 v1, s11, v7
-; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_add_f16_e32 v2, s10, v7
 ; VI-NEXT:    s_lshr_b32 s10, s9, 16
-; VI-NEXT:    v_or_b32_e32 v30, v2, v1
-; VI-NEXT:    v_add_f16_e32 v1, s10, v7
-; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT:    v_add_f16_e32 v52, s10, v7
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v30, v2, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v52
 ; VI-NEXT:    v_add_f16_e32 v2, s9, v7
 ; VI-NEXT:    s_lshr_b32 s9, s8, 16
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; VI-NEXT:    v_or_b32_e32 v4, v2, v1
 ; VI-NEXT:    v_add_f16_e32 v1, s9, v7
-; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_add_f16_e32 v2, s8, v7
 ; VI-NEXT:    s_lshr_b32 s8, s7, 16
-; VI-NEXT:    v_or_b32_e32 v3, v2, v1
-; VI-NEXT:    v_add_f16_e32 v1, s8, v7
-; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT:    v_add_f16_e32 v50, s8, v7
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v3, v2, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v50
 ; VI-NEXT:    v_add_f16_e32 v2, s7, v7
 ; VI-NEXT:    s_lshr_b32 s7, s6, 16
-; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; VI-NEXT:    v_or_b32_e32 v2, v2, v1
 ; VI-NEXT:    v_add_f16_e32 v1, s7, v7
 ; VI-NEXT:    v_add_f16_e32 v8, s6, v7
 ; VI-NEXT:    s_lshr_b32 s6, s17, 16
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_add_f16_e32 v36, s6, v7
-; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; VI-NEXT:    v_or_b32_e32 v1, v8, v1
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v36
 ; VI-NEXT:    v_add_f16_e32 v9, s17, v7
@@ -198206,12 +198101,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_e32 v33, v9, v8
 ; VI-NEXT:    v_add_f16_e32 v8, s6, v7
 ; VI-NEXT:    s_lshr_b32 s6, s19, 16
-; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; VI-NEXT:    v_add_f16_e32 v9, s16, v7
 ; VI-NEXT:    v_add_f16_e32 v38, s6, v7
-; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; VI-NEXT:    v_or_b32_e32 v32, v9, v8
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v38
 ; VI-NEXT:    v_add_f16_e32 v9, s19, v7
@@ -198219,12 +198114,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_e32 v21, v9, v8
 ; VI-NEXT:    v_add_f16_e32 v8, s6, v7
 ; VI-NEXT:    s_lshr_b32 s6, s21, 16
-; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; VI-NEXT:    v_add_f16_e32 v9, s18, v7
 ; VI-NEXT:    v_add_f16_e32 v61, s6, v7
-; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; VI-NEXT:    v_or_b32_e32 v20, v9, v8
 ; VI-NEXT:    s_lshr_b32 s7, s20, 16
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v61
@@ -198232,12 +198127,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_e32 v35, v9, v8
 ; VI-NEXT:    v_add_f16_e32 v8, s7, v7
 ; VI-NEXT:    s_lshr_b32 s6, s23, 16
-; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; VI-NEXT:    v_add_f16_e32 v9, s20, v7
 ; VI-NEXT:    v_add_f16_e32 v45, s6, v7
-; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; VI-NEXT:    v_or_b32_e32 v34, v9, v8
 ; VI-NEXT:    s_lshr_b32 s7, s22, 16
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v45
@@ -198245,12 +198140,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_e32 v19, v9, v8
 ; VI-NEXT:    v_add_f16_e32 v8, s7, v7
 ; VI-NEXT:    s_lshr_b32 s6, s25, 16
-; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; VI-NEXT:    v_add_f16_e32 v9, s22, v7
 ; VI-NEXT:    v_add_f16_e32 v47, s6, v7
-; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; VI-NEXT:    v_or_b32_e32 v18, v9, v8
 ; VI-NEXT:    s_lshr_b32 s7, s24, 16
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v47
@@ -198258,12 +198153,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_e32 v16, v9, v8
 ; VI-NEXT:    v_add_f16_e32 v8, s7, v7
 ; VI-NEXT:    s_lshr_b32 s6, s27, 16
-; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; VI-NEXT:    v_add_f16_e32 v9, s24, v7
 ; VI-NEXT:    v_add_f16_e32 v57, s6, v7
-; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; VI-NEXT:    v_or_b32_e32 v15, v9, v8
 ; VI-NEXT:    s_lshr_b32 s7, s26, 16
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v57
@@ -198271,12 +198166,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_e32 v13, v9, v8
 ; VI-NEXT:    v_add_f16_e32 v8, s7, v7
 ; VI-NEXT:    s_lshr_b32 s6, s29, 16
-; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; VI-NEXT:    v_add_f16_e32 v9, s26, v7
 ; VI-NEXT:    v_add_f16_e32 v59, s6, v7
-; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; VI-NEXT:    v_or_b32_e32 v12, v9, v8
 ; VI-NEXT:    s_lshr_b32 s7, s28, 16
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v59
@@ -198286,80 +198181,96 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_add_f16_e32 v8, s7, v7
 ; VI-NEXT:    s_lshr_b32 s7, s4, 16
 ; VI-NEXT:    v_add_f16_e32 v51, s6, v7
-; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; VI-NEXT:    v_add_f16_e32 v9, s28, v7
 ; VI-NEXT:    v_add_f16_e32 v54, s5, v7
-; VI-NEXT:    v_add_f16_e32 v53, s7, v7
+; VI-NEXT:    v_add_f16_e32 v11, s7, v7
 ; VI-NEXT:    v_add_f16_e32 v55, s4, v7
 ; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v51
-; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; VI-NEXT:    v_or_b32_e32 v9, v9, v8
 ; VI-NEXT:    v_or_b32_e32 v8, v54, v7
-; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v53
+; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v11
+; VI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; VI-NEXT:    v_or_b32_e32 v7, v55, v7
 ; VI-NEXT:    v_lshrrev_b32_e32 v11, 8, v8
-; VI-NEXT:    v_lshrrev_b32_e32 v41, 8, v7
+; VI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v11, 8, v7
 ; VI-NEXT:    v_lshrrev_b64 v[7:8], 24, v[7:8]
+; VI-NEXT:    v_lshrrev_b32_e32 v8, 8, v10
+; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v8, 8, v9
+; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v8, 8, v13
+; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
-; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
 ; VI-NEXT:    v_lshrrev_b64 v[1:2], 24, v[1:2]
-; VI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v4
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v3
+; VI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b64 v[1:2], 24, v[3:4]
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v4
 ; VI-NEXT:    v_lshrrev_b32_e32 v39, 8, v16
 ; VI-NEXT:    v_lshrrev_b64 v[16:17], 24, v[15:16]
-; VI-NEXT:    v_lshrrev_b32_e32 v37, 8, v13
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v3
 ; VI-NEXT:    v_lshrrev_b64 v[13:14], 24, v[12:13]
-; VI-NEXT:    v_lshrrev_b32_e32 v46, 8, v18
+; VI-NEXT:    v_lshrrev_b32_e32 v44, 8, v18
 ; VI-NEXT:    v_lshrrev_b64 v[17:18], 24, v[18:19]
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v60, 8, v15
-; VI-NEXT:    v_lshrrev_b32_e32 v49, 8, v19
-; VI-NEXT:    v_lshrrev_b32_e32 v44, 8, v35
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b64 v[1:2], 24, v[3:4]
+; VI-NEXT:    v_lshrrev_b32_e32 v49, 8, v15
+; VI-NEXT:    v_lshrrev_b32_e32 v40, 8, v19
+; VI-NEXT:    v_lshrrev_b32_e32 v60, 8, v35
 ; VI-NEXT:    v_lshrrev_b64 v[18:19], 24, v[34:35]
 ; VI-NEXT:    v_lshrrev_b32_e32 v35, 8, v23
 ; VI-NEXT:    v_lshrrev_b64 v[14:15], 24, v[22:23]
-; VI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT:    v_bfe_u32 v23, v50, 8, 8
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v23, v52, 8, 8
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v31
-; VI-NEXT:    v_lshrrev_b64 v[3:4], 24, v[30:31]
-; VI-NEXT:    v_lshrrev_b32_e32 v42, 8, v10
+; VI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v23, v37, 8, 8
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v30
+; VI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v23, v43, 8, 8
 ; VI-NEXT:    v_lshrrev_b64 v[10:11], 24, v[9:10]
-; VI-NEXT:    v_lshrrev_b32_e32 v40, 8, v20
-; VI-NEXT:    v_lshrrev_b64 v[19:20], 24, v[20:21]
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v30
+; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v23, v53, 8, 8
+; VI-NEXT:    v_lshrrev_b32_e32 v48, 8, v12
+; VI-NEXT:    v_lshrrev_b32_e32 v42, 8, v34
+; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v24
+; VI-NEXT:    v_lshrrev_b64 v[11:12], 24, v[24:25]
+; VI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshrrev_b64 v[3:4], 24, v[30:31]
+; VI-NEXT:    v_lshrrev_b32_e32 v56, 8, v20
+; VI-NEXT:    v_lshrrev_b64 v[19:20], 24, v[20:21]
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
 ; VI-NEXT:    v_lshrrev_b64 v[4:5], 24, v[5:6]
-; VI-NEXT:    v_lshrrev_b32_e32 v43, 8, v9
-; VI-NEXT:    v_lshrrev_b32_e32 v48, 8, v12
-; VI-NEXT:    v_lshrrev_b32_e32 v50, 8, v34
-; VI-NEXT:    v_lshrrev_b32_e32 v56, 8, v21
-; VI-NEXT:    v_lshrrev_b32_e32 v52, 8, v32
+; VI-NEXT:    v_lshrrev_b32_e32 v41, 8, v21
 ; VI-NEXT:    v_lshrrev_b64 v[20:21], 24, v[32:33]
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 8, v29
+; VI-NEXT:    v_lshrrev_b32_e32 v30, 8, v6
 ; VI-NEXT:    v_lshrrev_b64 v[5:6], 24, v[28:29]
-; VI-NEXT:    v_lshrrev_b32_e32 v29, 8, v26
 ; VI-NEXT:    v_lshrrev_b64 v[8:9], 24, v[26:27]
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v24
-; VI-NEXT:    v_lshrrev_b64 v[11:12], 24, v[24:25]
-; VI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshrrev_b32_e32 v58, 8, v33
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v46, 8, v33
+; VI-NEXT:    v_lshrrev_b32_e32 v58, 8, v32
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 8, v29
 ; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v28
 ; VI-NEXT:    v_lshrrev_b32_e32 v28, 8, v27
+; VI-NEXT:    v_lshrrev_b32_e32 v29, 8, v26
 ; VI-NEXT:    v_lshrrev_b32_e32 v9, 8, v25
 ; VI-NEXT:    v_lshrrev_b32_e32 v21, 8, v22
 ; VI-NEXT:    v_bfe_u32 v25, v51, 8, 8
@@ -198371,31 +198282,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_bfe_u32 v22, v38, 8, 8
 ; VI-NEXT:    v_bfe_u32 v2, v36, 8, 8
 ; VI-NEXT:    s_waitcnt vmcnt(5)
+; VI-NEXT:    v_bfe_u32 v26, v50, 8, 8
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_bfe_u32 v23, v23, 8, 8
-; VI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(4)
-; VI-NEXT:    v_bfe_u32 v24, v24, 8, 8
-; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_bfe_u32 v26, v26, 8, 8
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_bfe_u32 v23, v23, 8, 8
-; VI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_bfe_u32 v23, v23, 8, 8
-; VI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_bfe_u32 v23, v23, 8, 8
-; VI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_bfe_u32 v23, v23, 8, 8
-; VI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_bfe_u32 v23, v23, 8, 8
+; VI-NEXT:    v_bfe_u32 v24, v24, 8, 8
 ; VI-NEXT:    s_branch .LBB95_5
 ; VI-NEXT:  .LBB95_3:
 ; VI-NEXT:    ; implicit-def: $sgpr46
@@ -198555,136 +198446,133 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    s_branch .LBB95_2
 ; VI-NEXT:  .LBB95_4:
 ; VI-NEXT:    v_mov_b32_e32 v1, s44
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s45
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s42
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s43
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s40
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s41
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s14
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s15
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s12
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s13
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s10
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s11
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s8
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s9
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s6
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s16
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s17
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s18
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v1, s19
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v1, s19
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s20
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v1, s21
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v1, s21
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s22
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s23
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s24
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s25
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s26
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v1, s27
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v1, s27
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s28
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v1, s29
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v1, s29
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s71
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v1, s70
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s69
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s68
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s67
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s66
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s65
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s64
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s55
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v1, s54
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s87
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v1, s86
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s85
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v1, s84
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s53
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s52
 ; VI-NEXT:    v_readlane_b32 s6, v62, 0
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s6
 ; VI-NEXT:    v_readlane_b32 s6, v62, 1
 ; VI-NEXT:    v_mov_b32_e32 v36, s6
 ; VI-NEXT:    v_readlane_b32 s6, v62, 2
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s6
 ; VI-NEXT:    v_readlane_b32 s6, v62, 3
 ; VI-NEXT:    v_mov_b32_e32 v38, s6
 ; VI-NEXT:    v_readlane_b32 s6, v62, 4
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s6
 ; VI-NEXT:    v_readlane_b32 s6, v62, 5
 ; VI-NEXT:    v_mov_b32_e32 v61, s6
 ; VI-NEXT:    v_readlane_b32 s6, v62, 6
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s6
 ; VI-NEXT:    v_readlane_b32 s6, v62, 7
 ; VI-NEXT:    v_mov_b32_e32 v45, s6
 ; VI-NEXT:    v_readlane_b32 s6, v62, 8
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s6
 ; VI-NEXT:    v_readlane_b32 s6, v62, 9
 ; VI-NEXT:    v_mov_b32_e32 v47, s6
 ; VI-NEXT:    v_readlane_b32 s6, v62, 10
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s6
 ; VI-NEXT:    v_readlane_b32 s6, v62, 11
-; VI-NEXT:    v_mov_b32_e32 v55, s4
-; VI-NEXT:    v_readlane_b32 s4, v62, 16
 ; VI-NEXT:    v_mov_b32_e32 v57, s6
 ; VI-NEXT:    v_readlane_b32 s6, v62, 12
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v1, s6
+; VI-NEXT:    v_readlane_b32 s6, v62, 13
+; VI-NEXT:    v_mov_b32_e32 v55, s4
+; VI-NEXT:    v_readlane_b32 s4, v62, 16
+; VI-NEXT:    v_mov_b32_e32 v59, s6
+; VI-NEXT:    v_readlane_b32 s6, v62, 14
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 17
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s6
 ; VI-NEXT:    v_mov_b32_e32 v22, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 18
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 19
 ; VI-NEXT:    v_mov_b32_e32 v15, s4
@@ -198699,74 +198587,77 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_readlane_b32 s4, v62, 24
 ; VI-NEXT:    v_mov_b32_e32 v3, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 25
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v3, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 26
-; VI-NEXT:    v_mov_b32_e32 v58, s4
+; VI-NEXT:    v_mov_b32_e32 v46, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 27
-; VI-NEXT:    v_mov_b32_e32 v56, s4
+; VI-NEXT:    v_mov_b32_e32 v41, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 28
-; VI-NEXT:    v_mov_b32_e32 v44, s4
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v3, s80
+; VI-NEXT:    v_mov_b32_e32 v60, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 29
-; VI-NEXT:    v_mov_b32_e32 v49, s4
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v3, s81
+; VI-NEXT:    v_mov_b32_e32 v40, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 30
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v3, s80
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v3, s82
 ; VI-NEXT:    v_mov_b32_e32 v39, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 31
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v3, s81
-; VI-NEXT:    v_mov_b32_e32 v37, s4
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v3, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 32
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v3, s82
-; VI-NEXT:    v_mov_b32_e32 v42, s4
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v3, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 33
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v3, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 34
-; VI-NEXT:    v_mov_b32_e32 v41, s4
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v3, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 35
-; VI-NEXT:    v_mov_b32_e32 v43, s4
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v3, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 36
 ; VI-NEXT:    v_mov_b32_e32 v48, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 37
-; VI-NEXT:    v_mov_b32_e32 v60, s4
+; VI-NEXT:    v_mov_b32_e32 v49, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 38
-; VI-NEXT:    v_mov_b32_e32 v46, s4
+; VI-NEXT:    v_mov_b32_e32 v44, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 39
-; VI-NEXT:    v_mov_b32_e32 v50, s4
+; VI-NEXT:    v_mov_b32_e32 v42, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 40
-; VI-NEXT:    v_mov_b32_e32 v40, s4
+; VI-NEXT:    v_mov_b32_e32 v56, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 41
-; VI-NEXT:    v_mov_b32_e32 v52, s4
+; VI-NEXT:    v_mov_b32_e32 v58, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 42
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v3, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 43
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v3, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 44
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v3, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 45
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v3, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 46
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v3, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 47
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v3, s4
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v3, s78
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; VI-NEXT:    v_readlane_b32 s4, v62, 48
 ; VI-NEXT:    v_mov_b32_e32 v31, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 49
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v3, s4
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v3, s78
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v30, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 50
 ; VI-NEXT:    v_mov_b32_e32 v33, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 51
@@ -198776,20 +198667,20 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_readlane_b32 s4, v62, 53
 ; VI-NEXT:    v_mov_b32_e32 v28, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 54
-; VI-NEXT:    v_readlane_b32 s6, v62, 13
 ; VI-NEXT:    v_mov_b32_e32 v34, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 55
-; VI-NEXT:    v_mov_b32_e32 v59, s6
-; VI-NEXT:    v_readlane_b32 s6, v62, 14
 ; VI-NEXT:    v_mov_b32_e32 v9, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 56
 ; VI-NEXT:    v_mov_b32_e32 v3, s88
-; VI-NEXT:    v_mov_b32_e32 v53, s6
 ; VI-NEXT:    v_readlane_b32 s6, v62, 15
 ; VI-NEXT:    v_mov_b32_e32 v21, s4
 ; VI-NEXT:    v_readlane_b32 s4, v62, 57
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v50, s70
+; VI-NEXT:    v_mov_b32_e32 v43, s54
+; VI-NEXT:    v_mov_b32_e32 v37, s86
+; VI-NEXT:    v_mov_b32_e32 v52, s84
 ; VI-NEXT:    v_mov_b32_e32 v51, s6
 ; VI-NEXT:    v_mov_b32_e32 v54, s5
 ; VI-NEXT:    v_mov_b32_e32 v23, s83
@@ -198811,8 +198702,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_mov_b32_e32 v11, s38
 ; VI-NEXT:    v_mov_b32_e32 v14, s48
 ; VI-NEXT:  .LBB95_5: ; %end
-; VI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b32_e32 v52, 8, v52
+; VI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b32_e32 v58, 8, v58
 ; VI-NEXT:    v_lshlrev_b32_e32 v20, 8, v20
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -198853,31 +198744,31 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_readlane_b32 s31, v63, 1
 ; VI-NEXT:    v_readlane_b32 s30, v63, 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v52, v30, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v58, v53, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v20, v30, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v20, v52, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v20, v53, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v20, v58, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b32_e32 v20, 8, v58
+; VI-NEXT:    v_lshlrev_b32_e32 v20, 8, v46
+; VI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v20, v52, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v20, v46, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v2, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v20, vcc, 4, v0
 ; VI-NEXT:    buffer_store_dword v2, v20, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v40
+; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v56
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v2, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v2, v2, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v19, vcc, 8, v0
 ; VI-NEXT:    buffer_store_dword v2, v19, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v56
+; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v41
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v19, 8, v22
@@ -198885,36 +198776,36 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v2, v2, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v19, vcc, 12, v0
 ; VI-NEXT:    buffer_store_dword v2, v19, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v50
+; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v42
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v18, vcc, 16, v0
 ; VI-NEXT:    buffer_store_dword v2, v18, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v44
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v60
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v46
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v44
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v17
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 24, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v49
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v40
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v15
@@ -198922,9 +198813,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 28, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v60
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v49
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v16
@@ -198933,7 +198824,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v39
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -198942,8 +198833,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 36, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v48
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -198953,8 +198844,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v37
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v6
@@ -198962,9 +198855,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 44, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v43
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v10
@@ -198973,8 +198868,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 48, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v42
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v27
@@ -198982,14 +198879,18 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 52, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v41
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v7
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 56, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v25
 ; VI-NEXT:    v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -198998,8 +198899,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 60, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v21
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -199009,19 +198910,17 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 64, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v35
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v26
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x44, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v34
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -199031,8 +198930,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x48, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v9
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -199042,8 +198941,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x4c, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v29
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -199053,8 +198952,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x50, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v28
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -199064,90 +198963,86 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x54, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v33
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
-; VI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x58, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v32
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x5c, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v31
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x60, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v30
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x64, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x68, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x6c, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -199155,29 +199050,28 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x70, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x74, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -199185,15 +199079,15 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x78, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x7c, v0
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -199214,8 +199108,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; VI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; VI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
@@ -199631,7 +199525,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v36
 ; GFX9-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 8, v13
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v36
 ; GFX9-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 24, v22
@@ -199639,7 +199533,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v35
 ; GFX9-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v22
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v38
 ; GFX9-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 8, v22
@@ -199938,7 +199832,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; GFX9-NEXT:    v_readlane_b32 s4, v62, 22
 ; GFX9-NEXT:    v_mov_b32_e32 v60, s4
 ; GFX9-NEXT:    v_readlane_b32 s4, v62, 23
-; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_mov_b32_e32 v17, s4
 ; GFX9-NEXT:    v_readlane_b32 s4, v62, 24
 ; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
@@ -199946,7 +199840,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; GFX9-NEXT:    v_readlane_b32 s4, v62, 25
 ; GFX9-NEXT:    v_mov_b32_e32 v23, s4
 ; GFX9-NEXT:    v_readlane_b32 s4, v62, 26
-; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_mov_b32_e32 v17, s4
 ; GFX9-NEXT:    v_readlane_b32 s4, v62, 27
 ; GFX9-NEXT:    v_mov_b32_e32 v59, s4
@@ -200216,14 +200110,14 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
@@ -202656,19 +202550,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:392
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:4
@@ -202694,439 +202588,431 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v14
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v16
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v12
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v22
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v20
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v30
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v28
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v8
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v28
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 24, v16
 ; SI-NEXT:    v_lshlrev_b32_e32 v24, 24, v24
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr57
+; SI-NEXT:    ; implicit-def: $vgpr44
+; SI-NEXT:    ; implicit-def: $vgpr36
 ; SI-NEXT:    ; implicit-def: $vgpr43
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr58
 ; SI-NEXT:    ; implicit-def: $vgpr40
 ; SI-NEXT:    ; implicit-def: $vgpr42
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    ; implicit-def: $vgpr53
+; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; implicit-def: $vgpr20
 ; SI-NEXT:    ; implicit-def: $vgpr28
+; SI-NEXT:    ; implicit-def: $vgpr52
 ; SI-NEXT:    ; implicit-def: $vgpr38
+; SI-NEXT:    ; implicit-def: $vgpr46
 ; SI-NEXT:    ; implicit-def: $vgpr62
-; SI-NEXT:    ; implicit-def: $vgpr56
+; SI-NEXT:    ; implicit-def: $vgpr50
 ; SI-NEXT:    ; implicit-def: $vgpr59
 ; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    ; implicit-def: $vgpr50
-; SI-NEXT:    ; implicit-def: $vgpr47
 ; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; implicit-def: $vgpr47
 ; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; kill: killed $vgpr44
 ; SI-NEXT:    ; implicit-def: $vgpr44
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:332
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:360
+; SI-NEXT:    ; kill: killed $vgpr36
+; SI-NEXT:    ; implicit-def: $vgpr36
+; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:360
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v25
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 24, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v9
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v5
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v13
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v15
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v11
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v19
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v21
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:96
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v27
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v17
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v29
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v23
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v31
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:128
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:160
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v32
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:300
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 8, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v18
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v32
 ; SI-NEXT:    ; implicit-def: $vgpr21
 ; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr29
 ; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr17
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:88
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:184
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:80
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:112
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:128
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:152
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:80
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v29
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v31
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:108
+; SI-NEXT:    ; implicit-def: $vgpr29
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:160
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:144
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:112
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:184
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:140
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:180
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:176
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v33
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:180
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:176
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v34
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:108
-; SI-NEXT:    v_lshlrev_b32_e32 v34, 8, v10
-; SI-NEXT:    ; implicit-def: $vgpr10
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v33
 ; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v34
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:172
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:196
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:220
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:192
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:172
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:192
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:216
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:216
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:212
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:208
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:208
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v8
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:228
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:252
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:224
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:204
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:204
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:228
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:252
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:224
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:248
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:248
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:244
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:240
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:240
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v9
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v9
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v8
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:260
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:236
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:260
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:284
 ; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:256
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:236
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:280
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:276
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:272
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v11
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v11
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v8
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:292
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:316
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:288
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:268
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:292
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:316
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:288
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 8, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v35, 8, v8
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:312
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:308
 ; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:304
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v9
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v9
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_lshlrev_b32_e32 v51, 24, v11
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:324
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:300
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:324
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:348
 ; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:320
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 24, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 24, v4
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 8, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v63, 8, v8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:344
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:340
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:336
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:336
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v61, 24, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v60, 24, v9
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:356
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:332
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:356
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:380
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:352
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:352
 ; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:376
 ; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:372
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:368
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:364
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v16
-; SI-NEXT:    ; implicit-def: $vgpr16
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:364
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_lshlrev_b32_e32 v31, 24, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:388
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:384
+; SI-NEXT:    v_lshlrev_b32_e32 v56, 24, v4
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v8
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:388
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:384
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v10
+; SI-NEXT:    ; implicit-def: $vgpr10
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v18
+; SI-NEXT:    ; implicit-def: $vgpr18
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v26
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 8, v8
-; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 24, v11
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v63, 8, v17
-; SI-NEXT:    ; implicit-def: $vgpr11
 ; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 24, v7
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 24, v11
+; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_lshlrev_b32_e32 v31, 8, v15
+; SI-NEXT:    ; implicit-def: $vgpr15
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 24, v8
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:8
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:120
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:120
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:44
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:40
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:76
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:72
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:72
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:64
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:56
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:56
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v3
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v4
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v5
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    ; kill: killed $vgpr6
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v7
+; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    ; kill: killed $vgpr6
+; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v12
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    ; implicit-def: $vgpr12
+; SI-NEXT:    ; kill: killed $vgpr6
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; kill: killed $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; kill: killed $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; kill: killed $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; kill: killed $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; kill: killed $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; kill: killed $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:48
-; SI-NEXT:    v_lshlrev_b32_e32 v58, 8, v15
-; SI-NEXT:    ; implicit-def: $vgpr15
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:48
+; SI-NEXT:    v_lshlrev_b32_e32 v53, 8, v13
+; SI-NEXT:    ; implicit-def: $vgpr13
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr8
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:104
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v14
+; SI-NEXT:    ; implicit-def: $vgpr14
+; SI-NEXT:    ; kill: killed $vgpr14
+; SI-NEXT:    ; implicit-def: $vgpr14
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:136
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:168
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:200
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:232
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:264
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:296
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:328
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
@@ -203167,26 +203053,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_or_b32_e32 v7, v1, v2
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_or_b32_e32 v12, v1, v2
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v12
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v1, v2, v1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_or_b32_e32 v42, v2, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
@@ -203195,7 +203074,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v5, v2, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -203203,444 +203082,488 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v26, v2, v1
+; SI-NEXT:    v_or_b32_e32 v7, v16, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v11, v2, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v42, v1, v2
+; SI-NEXT:    v_or_b32_e32 v40, v1, v2
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v39, v24, v1
+; SI-NEXT:    v_or_b32_e32 v10, v24, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v17, v2, v1
+; SI-NEXT:    v_or_b32_e32 v15, v2, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v55, v1, v2
+; SI-NEXT:    v_or_b32_e32 v26, v1, v2
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v18, v2, v1
+; SI-NEXT:    v_or_b32_e32 v18, v25, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v20, v2, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v19, v2, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v28, v2, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v30, v2, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v27, v2, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v21, v2, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v29, v1, v2
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v30, v2, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v33, v2, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v53, v2, v1
+; SI-NEXT:    v_or_b32_e32 v27, v2, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v19, v1, v2
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v33, v2, v1
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v38, v1, v2
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v43, v2, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v15, v1, v2
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v52, v1, v2
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v38, v2, v1
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v1, v2, v1
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v23
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v2, v1, v2
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v45, v2, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v47, v1, v2
+; SI-NEXT:    v_or_b32_e32 v55, v1, v6
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v2, v2, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v57, v3, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v59, v1, v3
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v17, v6, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v4, v3, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v62, v3, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_or_b32_e32 v14, v1, v6
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v44, v1, v3
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v44, v1, v6
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v8, v1, v3
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v45, v6, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v21, v1, v3
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v25, v1, v6
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v47, v1, v6
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v58, v6, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v6, v3, v1
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v16, v1, v6
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v59, v1, v6
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v1
+; SI-NEXT:    v_or_b32_e32 v62, v6, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v24, v1, v6
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v3, v1, v3
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v13, v1, v6
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v12, 0xff, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v16, v1, v12
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v6, v6, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v12, 0xff, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v8, 0xff, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v24, v12, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v23, v1, v8
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v12, 0xff, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; SI-NEXT:    v_mov_b32_e32 v1, v32
-; SI-NEXT:    v_or_b32_e32 v32, v10, v12
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v8, 0xff, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v12, 0xff, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; SI-NEXT:    v_mov_b32_e32 v10, v34
-; SI-NEXT:    v_or_b32_e32 v34, v22, v12
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v36, v1, v8
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; SI-NEXT:    v_or_b32_e32 v36, v12, v35
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v8, 0xff, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v8, v8, v35
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; SI-NEXT:    v_or_b32_e32 v35, v37, v12
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; SI-NEXT:    v_or_b32_e32 v35, v37, v22
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; SI-NEXT:    v_or_b32_e32 v37, v51, v12
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; SI-NEXT:    v_or_b32_e32 v37, v51, v22
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; SI-NEXT:    v_or_b32_e32 v51, v12, v60
-; SI-NEXT:    v_and_b32_e32 v12, 0xff, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; SI-NEXT:    v_or_b32_e32 v14, v31, v12
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v51, v22, v63
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; SI-NEXT:    v_or_b32_e32 v34, v56, v22
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; SI-NEXT:    v_or_b32_e32 v31, v61, v12
-; SI-NEXT:    v_and_b32_e32 v12, 0xff, v54
-; SI-NEXT:    v_or_b32_e32 v54, v12, v23
-; SI-NEXT:    v_and_b32_e32 v12, 0xff, v46
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; SI-NEXT:    v_or_b32_e32 v23, v25, v12
-; SI-NEXT:    v_and_b32_e32 v12, 0xff, v41
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; SI-NEXT:    v_or_b32_e32 v13, v13, v12
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; SI-NEXT:    v_or_b32_e32 v56, v60, v22
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v54
+; SI-NEXT:    v_or_b32_e32 v54, v22, v4
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v41
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; SI-NEXT:    v_or_b32_e32 v9, v9, v22
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v57
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_or_b32_e32 v4, v32, v4
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; SI-NEXT:    v_or_b32_e32 v12, v12, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v28, v22, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v22, 0xff, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v25, v22, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v32, v22, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v22, 0xff, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v40, v22, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v39, v22, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v22, 0xff, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v41, v22, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v22, 0xff, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v46, v22, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v57, v22, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v22, 0xff, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v60, v22, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v22, 0xff, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v61, v22, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v63, v22, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v22, 0xff, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v1, v22, v1
-; SI-NEXT:    v_and_b32_e32 v22, 0xff, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v20
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_or_b32_e32 v1, v1, v45
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; SI-NEXT:    v_or_b32_e32 v1, v1, v55
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_or_b32_e32 v10, v22, v10
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v48, v22, v20
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; SI-NEXT:    v_or_b32_e32 v48, v22, v48
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v20
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; SI-NEXT:    v_or_b32_e32 v56, v22, v49
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; SI-NEXT:    v_or_b32_e32 v46, v22, v20
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v49, v22, v49
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v20
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; SI-NEXT:    v_or_b32_e32 v50, v22, v50
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v49, v22, v20
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; SI-NEXT:    v_or_b32_e32 v58, v22, v58
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v20
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v50, v22, v20
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v20
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v20, v22, v20
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; SI-NEXT:    v_or_b32_e32 v9, v22, v9
-; SI-NEXT:    v_and_b32_e32 v22, 0xff, v52
-; SI-NEXT:    v_or_b32_e32 v52, v22, v63
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v53, v22, v53
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v22, v7, v63
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v12
-; SI-NEXT:    v_or_b32_e32 v12, v7, v5
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; SI-NEXT:    v_or_b32_e32 v3, v22, v3
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v61
+; SI-NEXT:    v_mov_b32_e32 v61, v42
+; SI-NEXT:    v_or_b32_e32 v31, v22, v31
+; SI-NEXT:    v_or_b32_e32 v22, v12, v61
+; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v28
+; SI-NEXT:    v_or_b32_e32 v43, v12, v5
 ; SI-NEXT:    v_alignbit_b32 v5, v22, v5, 16
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v25
+; SI-NEXT:    v_mov_b32_e32 v12, v7
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v32
 ; SI-NEXT:    v_or_b32_e32 v7, v7, v11
-; SI-NEXT:    v_mov_b32_e32 v25, v39
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v32, 0xffff, v49
+; SI-NEXT:    v_or_b32_e32 v32, v32, v59
+; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT:    v_or_b32_e32 v5, v5, v26
+; SI-NEXT:    v_or_b32_e32 v5, v5, v12
 ; SI-NEXT:    v_alignbit_b32 v11, v5, v11, 16
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v42
-; SI-NEXT:    v_or_b32_e32 v42, v11, v25
 ; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v40
-; SI-NEXT:    v_or_b32_e32 v40, v11, v17
-; SI-NEXT:    v_alignbit_b32 v11, v42, v17, 16
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v55
-; SI-NEXT:    v_or_b32_e32 v55, v11, v18
+; SI-NEXT:    v_or_b32_e32 v42, v11, v10
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v39
+; SI-NEXT:    v_or_b32_e32 v40, v11, v15
+; SI-NEXT:    v_alignbit_b32 v11, v42, v15, 16
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v26
+; SI-NEXT:    v_or_b32_e32 v26, v11, v18
 ; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v41
-; SI-NEXT:    v_or_b32_e32 v39, v11, v20
-; SI-NEXT:    v_alignbit_b32 v11, v55, v20, 16
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
+; SI-NEXT:    v_or_b32_e32 v39, v11, v19
+; SI-NEXT:    v_alignbit_b32 v11, v26, v19, 16
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v41, v28
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT:    ; implicit-def: $vgpr41
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; SI-NEXT:    v_or_b32_e32 v28, v11, v41
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v46
-; SI-NEXT:    v_or_b32_e32 v20, v11, v27
-; SI-NEXT:    v_alignbit_b32 v11, v28, v27, 16
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
+; SI-NEXT:    v_or_b32_e32 v28, v11, v30
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v57
+; SI-NEXT:    v_or_b32_e32 v11, v11, v21
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v11, v28, v21, 16
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v29
-; SI-NEXT:    v_or_b32_e32 v29, v11, v30
+; SI-NEXT:    v_or_b32_e32 v29, v11, v33
 ; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v60
-; SI-NEXT:    v_or_b32_e32 v27, v11, v53
-; SI-NEXT:    v_alignbit_b32 v11, v29, v53, 16
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v19
-; SI-NEXT:    v_or_b32_e32 v19, v11, v33
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v61
-; SI-NEXT:    v_or_b32_e32 v11, v11, v43
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v11, v19, v43, 16
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v15
-; SI-NEXT:    v_or_b32_e32 v11, v11, v38
-; SI-NEXT:    v_alignbit_b32 v1, v11, v45, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
+; SI-NEXT:    v_or_b32_e32 v21, v11, v27
+; SI-NEXT:    v_alignbit_b32 v11, v29, v27, 16
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v47
-; SI-NEXT:    v_or_b32_e32 v15, v1, v2
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v10
-; SI-NEXT:    v_or_b32_e32 v1, v1, v57
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT:    ; implicit-def: $vgpr57
+; SI-NEXT:    ; implicit-def: $vgpr60
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT:    v_or_b32_e32 v19, v11, v38
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v63
+; SI-NEXT:    v_or_b32_e32 v27, v11, v52
+; SI-NEXT:    v_alignbit_b32 v11, v19, v52, 16
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v15, v57, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v30
+; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v23
+; SI-NEXT:    ; implicit-def: $vgpr63
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT:    v_or_b32_e32 v11, v11, v2
+; SI-NEXT:    v_alignbit_b32 v1, v11, v55, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v59
-; SI-NEXT:    v_or_b32_e32 v17, v1, v4
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v17
+; SI-NEXT:    v_or_b32_e32 v15, v1, v14
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v48
-; SI-NEXT:    v_or_b32_e32 v1, v1, v62
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; SI-NEXT:    v_or_b32_e32 v1, v1, v44
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v17, v62, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v15, v44, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v44
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v56
-; SI-NEXT:    v_or_b32_e32 v1, v1, v8
-; SI-NEXT:    v_or_b32_e32 v10, v10, v21
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v1, v21, 16
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
-; SI-NEXT:    v_or_b32_e32 v62, v6, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v33
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v23
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v45
+; SI-NEXT:    v_or_b32_e32 v17, v1, v25
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v46
+; SI-NEXT:    v_or_b32_e32 v1, v1, v47
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v1, v17, v47, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v58
+; SI-NEXT:    v_or_b32_e32 v1, v1, v16
+; SI-NEXT:    v_alignbit_b32 v32, v1, v59, 16
+; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v32, 0xffff, v62
+; SI-NEXT:    v_or_b32_e32 v59, v6, v23
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v20
+; SI-NEXT:    v_or_b32_e32 v62, v32, v24
+; SI-NEXT:    v_and_b32_e32 v32, 0xffff, v50
+; SI-NEXT:    v_or_b32_e32 v50, v6, v36
+; SI-NEXT:    v_alignbit_b32 v6, v59, v36, 16
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v8
+; SI-NEXT:    v_or_b32_e32 v47, v6, v35
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v53
+; SI-NEXT:    v_or_b32_e32 v49, v6, v37
+; SI-NEXT:    v_alignbit_b32 v6, v47, v37, 16
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v51
+; SI-NEXT:    v_or_b32_e32 v45, v6, v34
+; SI-NEXT:    v_or_b32_e32 v48, v3, v56
+; SI-NEXT:    v_alignbit_b32 v3, v45, v56, 16
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v54
+; SI-NEXT:    v_or_b32_e32 v44, v3, v4
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v31
+; SI-NEXT:    v_or_b32_e32 v3, v3, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v14
+; SI-NEXT:    v_mov_b32_e32 v14, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr3
@@ -203824,175 +203747,133 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v49
 ; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    v_or_b32_e32 v6, v6, v16
 ; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v6, v62, v16, 16
 ; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v24
 ; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    v_or_b32_e32 v59, v6, v32
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v50
 ; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    v_or_b32_e32 v56, v6, v34
-; SI-NEXT:    v_alignbit_b32 v6, v59, v34, 16
 ; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v36
 ; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    v_or_b32_e32 v47, v6, v35
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v58
 ; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    v_or_b32_e32 v50, v6, v37
-; SI-NEXT:    v_alignbit_b32 v6, v47, v37, 16
 ; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v51
 ; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    v_or_b32_e32 v45, v6, v14
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v9
 ; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    v_or_b32_e32 v49, v6, v31
-; SI-NEXT:    v_alignbit_b32 v6, v45, v31, 16
 ; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v54
 ; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    v_or_b32_e32 v44, v6, v23
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v52
 ; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    v_or_b32_e32 v48, v6, v13
-; SI-NEXT:    v_alignbit_b32 v6, v44, v13, 16
+; SI-NEXT:    v_or_b32_e32 v46, v32, v13
+; SI-NEXT:    v_alignbit_b32 v13, v62, v13, 16
+; SI-NEXT:    v_alignbit_b32 v6, v44, v9, 16
 ; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v63
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v26
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v41
-; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v38
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v2
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v61
+; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v10
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v33
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v38
+; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v25
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v24
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v35
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v14
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr52
-; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v35
+; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v34
+; SI-NEXT:    ; implicit-def: $vgpr23
+; SI-NEXT:    ; implicit-def: $vgpr34
+; SI-NEXT:    ; implicit-def: $vgpr61
 ; SI-NEXT:    ; implicit-def: $vgpr54
-; SI-NEXT:    ; implicit-def: $vgpr46
+; SI-NEXT:    ; implicit-def: $vgpr16
 ; SI-NEXT:    ; implicit-def: $vgpr24
+; SI-NEXT:    ; implicit-def: $vgpr25
 ; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr37
 ; SI-NEXT:    ; implicit-def: $vgpr51
-; SI-NEXT:    ; implicit-def: $vgpr60
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr61
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $vgpr56
+; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr34
+; SI-NEXT:    ; implicit-def: $vgpr9
 ; SI-NEXT:    ; kill: killed $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr53
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; kill: killed $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr58
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr63
+; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:  .LBB96_2: ; %Flow
 ; SI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB96_4
 ; SI-NEXT:  ; %bb.3: ; %cmp.true
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v52
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v61
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v41
-; SI-NEXT:    v_or_b32_e32 v1, v63, v1
+; SI-NEXT:    v_or_b32_e32 v1, v31, v1
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x300, v1
-; SI-NEXT:    v_or_b32_e32 v2, v13, v2
+; SI-NEXT:    v_or_b32_e32 v2, v9, v2
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v54
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 3, v46
-; SI-NEXT:    v_or_b32_e32 v2, v23, v2
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT:    v_or_b32_e32 v2, v4, v2
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v57
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
 ; SI-NEXT:    s_movk_i32 s6, 0x300
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, s6, v2
-; SI-NEXT:    v_or_b32_e32 v3, v25, v3
+; SI-NEXT:    v_or_b32_e32 v4, v32, v4
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v2, v4, v2
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v14
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v34
 ; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v5, v31, v5
+; SI-NEXT:    v_or_b32_e32 v5, v56, v5
+; SI-NEXT:    v_mov_b32_e32 v30, v16
 ; SI-NEXT:    s_mov_b32 s7, 0x3000000
-; SI-NEXT:    v_mov_b32_e32 v30, v24
+; SI-NEXT:    v_mov_b32_e32 v31, v24
 ; SI-NEXT:    v_add_i32_e32 v44, vcc, s7, v2
-; SI-NEXT:    v_add_i32_e32 v31, vcc, s7, v1
-; SI-NEXT:    v_mov_b32_e32 v48, v31
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v27, vcc, s7, v1
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 3, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v4
-; SI-NEXT:    v_or_b32_e32 v3, v9, v3
 ; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, s6, v3
-; SI-NEXT:    v_or_b32_e32 v4, v61, v4
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v4, v4, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, 3, v6
 ; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
@@ -204001,27 +203882,41 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; SI-NEXT:    v_or_b32_e32 v7, v37, v7
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v33, vcc, s7, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v49, v33
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 3, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT:    v_or_b32_e32 v3, v60, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, s6, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v5, v5, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v45, vcc, s7, v5
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_add_i32_e32 v16, vcc, 3, v16
+; SI-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 3, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT:    v_or_b32_e32 v3, v58, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, s6, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v3, v6, v3
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v4
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_or_b32_e32 v4, v60, v4
+; SI-NEXT:    v_or_b32_e32 v3, v4, v3
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v32, vcc, s7, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v48, v32
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v4
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT:    v_or_b32_e32 v4, v63, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, s6, v4
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT:    v_or_b32_e32 v4, v5, v4
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v45, vcc, s7, v4
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v45
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v5
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT:    v_or_b32_e32 v5, v53, v5
+; SI-NEXT:    v_add_i32_e32 v5, vcc, s6, v5
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; SI-NEXT:    v_or_b32_e32 v5, v6, v5
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v33, vcc, s7, v5
+; SI-NEXT:    v_mov_b32_e32 v49, v33
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, 3, v6
 ; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
@@ -204029,16 +203924,13 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, s6, v6
 ; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; SI-NEXT:    v_or_b32_e32 v6, v7, v6
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v35, vcc, s7, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v47, vcc, s7, v6
-; SI-NEXT:    v_mov_b32_e32 v50, v35
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 3, v7
 ; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
 ; SI-NEXT:    v_or_b32_e32 v7, v8, v7
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, s6, v7
 ; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -204047,16 +203939,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; SI-NEXT:    v_or_b32_e32 v8, v9, v8
 ; SI-NEXT:    v_or_b32_e32 v7, v8, v7
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, s7, v7
-; SI-NEXT:    v_mov_b32_e32 v56, v6
+; SI-NEXT:    v_mov_b32_e32 v50, v6
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v8, vcc, 3, v8
 ; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v8, v9, v8
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v8, vcc, s6, v8
 ; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -204065,15 +203957,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; SI-NEXT:    v_or_b32_e32 v9, v10, v9
 ; SI-NEXT:    v_or_b32_e32 v8, v9, v8
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v59, vcc, s7, v8
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v9, vcc, 3, v9
 ; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v9, v10, v9
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v9, vcc, s6, v9
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -204082,15 +203974,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; SI-NEXT:    v_or_b32_e32 v10, v11, v10
 ; SI-NEXT:    v_or_b32_e32 v9, v10, v9
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v8, vcc, s7, v9
+; SI-NEXT:    v_mov_b32_e32 v46, v8
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v10, vcc, 3, v10
 ; SI-NEXT:    v_and_b32_e32 v10, 0xff, v10
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v10, v11, v10
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v10, vcc, s6, v10
 ; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -204099,15 +203992,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; SI-NEXT:    v_or_b32_e32 v11, v12, v11
 ; SI-NEXT:    v_or_b32_e32 v10, v11, v10
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v62, vcc, s7, v10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v11, vcc, 3, v11
 ; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v11, v12, v11
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v11, vcc, s6, v11
 ; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -204116,15 +204009,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
 ; SI-NEXT:    v_or_b32_e32 v12, v13, v12
 ; SI-NEXT:    v_or_b32_e32 v11, v12, v11
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v9, vcc, s7, v11
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v12, vcc, 3, v12
 ; SI-NEXT:    v_and_b32_e32 v12, 0xff, v12
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v12, v13, v12
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v12, vcc, s6, v12
 ; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v12
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -204133,15 +204026,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; SI-NEXT:    v_or_b32_e32 v13, v14, v13
 ; SI-NEXT:    v_or_b32_e32 v12, v13, v12
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, s7, v12
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v13, vcc, 3, v13
 ; SI-NEXT:    v_and_b32_e32 v13, 0xff, v13
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v13, v14, v13
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v13, vcc, s6, v13
 ; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -204150,48 +204043,45 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; SI-NEXT:    v_or_b32_e32 v14, v15, v14
 ; SI-NEXT:    v_or_b32_e32 v13, v14, v13
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v10, vcc, s7, v13
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v14, vcc, 3, v14
 ; SI-NEXT:    v_and_b32_e32 v14, 0xff, v14
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v14, v15, v14
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v14, vcc, s6, v14
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v14
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 3, v15
 ; SI-NEXT:    v_and_b32_e32 v15, 0xff, v15
 ; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; SI-NEXT:    v_or_b32_e32 v15, v16, v15
+; SI-NEXT:    v_or_b32_e32 v15, v17, v15
 ; SI-NEXT:    v_or_b32_e32 v14, v15, v14
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 3, v15
 ; SI-NEXT:    v_and_b32_e32 v15, 0xff, v15
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v15, v16, v15
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v15, v17, v15
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, s6, v15
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v16, vcc, 3, v16
-; SI-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
 ; SI-NEXT:    v_or_b32_e32 v16, v17, v16
 ; SI-NEXT:    v_or_b32_e32 v15, v16, v15
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v13, vcc, s7, v15
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 3, v16
 ; SI-NEXT:    v_and_b32_e32 v16, 0xff, v16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v16, v17, v16
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, s6, v16
 ; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -204200,10 +204090,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
 ; SI-NEXT:    v_or_b32_e32 v17, v18, v17
 ; SI-NEXT:    v_or_b32_e32 v16, v17, v16
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, s7, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v15
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, 3, v17
 ; SI-NEXT:    v_and_b32_e32 v17, 0xff, v17
@@ -204218,30 +204107,30 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
 ; SI-NEXT:    v_or_b32_e32 v18, v19, v18
 ; SI-NEXT:    v_or_b32_e32 v18, v18, v17
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v16, vcc, s7, v18
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, 3, v17
 ; SI-NEXT:    v_and_b32_e32 v17, 0xff, v17
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v17, v19, v17
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v17, vcc, s6, v17
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 3, v19
+; SI-NEXT:    v_add_i32_e32 v19, vcc, 3, v23
 ; SI-NEXT:    v_and_b32_e32 v19, 0xff, v19
 ; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT:    v_add_i32_e32 v17, vcc, s6, v17
 ; SI-NEXT:    v_or_b32_e32 v19, v20, v19
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v17
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v11, vcc, s7, v19
 ; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v11
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, 3, v17
 ; SI-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_or_b32_e32 v17, v20, v17
 ; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, s6, v17
@@ -204253,14 +204142,13 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v20, v21, v20
 ; SI-NEXT:    v_or_b32_e32 v20, v20, v17
 ; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v16, vcc, s7, v20
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, 3, v17
 ; SI-NEXT:    v_and_b32_e32 v17, 0xff, v17
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v17, v21, v17
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, s6, v17
 ; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -204269,15 +204157,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
 ; SI-NEXT:    v_or_b32_e32 v21, v22, v21
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v17
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v19, vcc, s7, v21
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, 3, v17
 ; SI-NEXT:    v_and_b32_e32 v17, 0xff, v17
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v17, v22, v17
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, s6, v17
 ; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -204286,31 +204174,32 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
 ; SI-NEXT:    v_or_b32_e32 v22, v23, v22
 ; SI-NEXT:    v_or_b32_e32 v22, v22, v17
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v27, vcc, s7, v22
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v21, vcc, s7, v22
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, 3, v17
 ; SI-NEXT:    v_and_b32_e32 v17, 0xff, v17
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v17, v23, v17
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, s6, v17
 ; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v23, vcc, 3, v23
 ; SI-NEXT:    v_and_b32_e32 v23, 0xff, v23
 ; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; SI-NEXT:    v_or_b32_e32 v23, v25, v23
+; SI-NEXT:    v_or_b32_e32 v23, v26, v23
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v17
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v26, v25
 ; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v29, vcc, s7, v23
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, 3, v17
 ; SI-NEXT:    v_and_b32_e32 v17, 0xff, v17
 ; SI-NEXT:    v_or_b32_e32 v24, v24, v17
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, s6, v24
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -204319,15 +204208,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v17
 ; SI-NEXT:    v_or_b32_e32 v2, v25, v2
-; SI-NEXT:    v_add_i32_e32 v20, vcc, s7, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v18, vcc, s7, v2
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, s7, v14
-; SI-NEXT:    v_add_i32_e32 v14, vcc, s7, v18
+; SI-NEXT:    v_mov_b32_e32 v14, v27
+; SI-NEXT:    v_add_i32_e32 v27, vcc, s7, v20
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, s6, v2
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -204337,10 +204227,10 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v3, v4, v3
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_add_i32_e32 v28, vcc, s7, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v28
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v28
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
@@ -204357,8 +204247,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_add_i32_e32 v39, vcc, s7, v2
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
@@ -204371,17 +204261,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 3, v3
 ; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_or_b32_e32 v3, v4, v3
+; SI-NEXT:    v_or_b32_e32 v3, v26, v3
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_add_i32_e32 v55, vcc, s7, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v26, vcc, s7, v2
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v55
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v26
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, s6, v2
@@ -204394,8 +204283,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_add_i32_e32 v40, vcc, s7, v2
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
@@ -204408,29 +204297,30 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 3, v3
 ; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_or_b32_e32 v3, v30, v3
+; SI-NEXT:    v_or_b32_e32 v3, v31, v3
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_add_i32_e32 v42, vcc, s7, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v42
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v59
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v42
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v2, v3, v2
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, s6, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 3, v3
-; SI-NEXT:    v_or_b32_e32 v2, v34, v2
 ; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT:    v_add_i32_e32 v2, vcc, s6, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    v_or_b32_e32 v3, v4, v3
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, s7, v2
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
@@ -204443,27 +204333,28 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 3, v3
 ; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_or_b32_e32 v3, v4, v3
+; SI-NEXT:    v_or_b32_e32 v3, v30, v3
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, s7, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v5
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v59
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 3, v3
-; SI-NEXT:    v_or_b32_e32 v2, v32, v2
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v2, v3, v2
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, s6, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 3, v3
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_or_b32_e32 v3, v4, v3
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_add_i32_e32 v12, vcc, s7, v2
+; SI-NEXT:    v_add_i32_e32 v43, vcc, s7, v2
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
@@ -204472,7 +204363,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, s6, v2
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -204482,239 +204373,236 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v3, v4, v3
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_add_i32_e32 v22, vcc, s7, v2
-; SI-NEXT:    v_alignbit_b32 v2, v22, v12, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v2, v22, v43, 16
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v2, v5, v7, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v2, v42, v40, 16
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v2, v26, v39, 16
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_alignbit_b32 v2, v28, v18, 16
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v55, v39, 16
+; SI-NEXT:    v_alignbit_b32 v2, v29, v21, 16
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v28, v20, 16
+; SI-NEXT:    v_alignbit_b32 v2, v19, v27, 16
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v29, v27, 16
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_alignbit_b32 v2, v11, v16, 16
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_alignbit_b32 v2, v19, v16, 16
+; SI-NEXT:    v_alignbit_b32 v2, v15, v13, 16
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_alignbit_b32 v2, v11, v14, 16
+; SI-NEXT:    v_alignbit_b32 v2, v17, v10, 16
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_alignbit_b32 v2, v15, v13, 16
+; SI-NEXT:    v_alignbit_b32 v2, v1, v9, 16
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_alignbit_b32 v2, v17, v10, 16
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v2, v62, v8, 16
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_alignbit_b32 v2, v1, v9, 16
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v2, v59, v6, 16
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_alignbit_b32 v2, v62, v8, 16
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v2, v47, v33, 16
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v59, v6, 16
+; SI-NEXT:    v_alignbit_b32 v2, v45, v32, 16
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v47, v35, 16
+; SI-NEXT:    v_alignbit_b32 v2, v44, v14, 16
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v45, v33, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v44, v31, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v29
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v29
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v15
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v17
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v62
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v47
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v45
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v47
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v44
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
 ; SI-NEXT:  .LBB96_4: ; %end
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v12
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v43
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_or_b32_e32 v3, v3, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
 ; SI-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v57
-; SI-NEXT:    v_or_b32_e32 v3, v3, v9
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v3, v9, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v58
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 4, v0
+; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_or_b32_e32 v3, v3, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 8, v0
+; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v43
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v12
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 12, v0
+; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v40
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 16, v0
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v42
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v36
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v55
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 20, v0
+; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v39
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 24, v0
+; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v55
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v53
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 28, v0
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v20
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 28, v0
+; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v20
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 32, v0
+; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v21
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 36, v0
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v52
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 36, v0
+; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v27
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v21
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 40, v0
+; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v16
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 44, v0
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v13
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 44, v0
+; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v27
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 48, v0
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 48, v0
+; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v19
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 52, v0
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 52, v0
+; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 56, v0
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 56, v0
+; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v38
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 60, v0
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v38
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 60, v0
+; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 64, v0
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 64, v0
+; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v26
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x44, v0
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v18
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x44, v0
+; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x48, v0
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x48, v0
+; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x4c, v0
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v10
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x4c, v0
+; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x50, v0
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x50, v0
+; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v46
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -204726,9 +204614,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v56
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v50
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -204740,9 +204628,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v50
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v49
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -204750,13 +204638,13 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v47
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x6c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v49
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v48
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -204764,19 +204652,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v45
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v36
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x74, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v48
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v14
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v44
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
@@ -208989,8 +208877,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
@@ -209008,21 +208896,21 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:332
-; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:328
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:324
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:320
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:308
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:304
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:300
-; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:296
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:292
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:288
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:332
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:328
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:324
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:320
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:308
+; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:304
+; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:300
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:296
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:292
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:288
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:276
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:272
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:268
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:268
 ; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:264
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:260
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:260
 ; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:256
 ; SI-NEXT:    s_waitcnt expcnt(6)
 ; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:244
@@ -209032,13 +208920,22 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:236
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:232
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:228
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:224
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:224
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:212
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:208
-; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:204
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:204
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:200
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:196
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:196
+; SI-NEXT:    ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
 ; SI-NEXT:    v_writelane_b32 v63, s30, 0
+; SI-NEXT:    v_writelane_b32 v62, s28, 0
+; SI-NEXT:    v_writelane_b32 v62, s25, 1
+; SI-NEXT:    v_writelane_b32 v62, s24, 2
+; SI-NEXT:    v_writelane_b32 v62, s23, 3
+; SI-NEXT:    v_writelane_b32 v62, s22, 4
+; SI-NEXT:    v_writelane_b32 v62, s21, 5
+; SI-NEXT:    v_writelane_b32 v62, s18, 6
+; SI-NEXT:    v_writelane_b32 v62, s16, 7
 ; SI-NEXT:    v_writelane_b32 v63, s31, 1
 ; SI-NEXT:    v_writelane_b32 v63, s34, 2
 ; SI-NEXT:    v_writelane_b32 v63, s35, 3
@@ -209051,29 +208948,17 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    v_writelane_b32 v63, s50, 10
 ; SI-NEXT:    v_writelane_b32 v63, s51, 11
 ; SI-NEXT:    v_writelane_b32 v63, s52, 12
-; SI-NEXT:    ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
 ; SI-NEXT:    v_writelane_b32 v63, s53, 13
-; SI-NEXT:    v_writelane_b32 v62, s28, 0
 ; SI-NEXT:    v_writelane_b32 v63, s54, 14
-; SI-NEXT:    v_writelane_b32 v62, s27, 1
 ; SI-NEXT:    v_writelane_b32 v63, s55, 15
-; SI-NEXT:    v_writelane_b32 v62, s26, 2
 ; SI-NEXT:    v_writelane_b32 v63, s64, 16
-; SI-NEXT:    v_writelane_b32 v62, s25, 3
 ; SI-NEXT:    v_writelane_b32 v63, s65, 17
-; SI-NEXT:    v_writelane_b32 v62, s24, 4
 ; SI-NEXT:    v_writelane_b32 v63, s66, 18
-; SI-NEXT:    v_writelane_b32 v62, s23, 5
 ; SI-NEXT:    v_writelane_b32 v63, s67, 19
-; SI-NEXT:    v_writelane_b32 v62, s22, 6
 ; SI-NEXT:    v_writelane_b32 v63, s68, 20
-; SI-NEXT:    v_writelane_b32 v62, s21, 7
 ; SI-NEXT:    v_writelane_b32 v63, s69, 21
-; SI-NEXT:    v_writelane_b32 v62, s20, 8
 ; SI-NEXT:    v_writelane_b32 v63, s70, 22
-; SI-NEXT:    v_writelane_b32 v62, s18, 9
 ; SI-NEXT:    v_writelane_b32 v63, s71, 23
-; SI-NEXT:    v_writelane_b32 v62, s16, 10
 ; SI-NEXT:    v_writelane_b32 v63, s80, 24
 ; SI-NEXT:    v_writelane_b32 v63, s81, 25
 ; SI-NEXT:    v_writelane_b32 v63, s82, 26
@@ -209085,252 +208970,254 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    v_writelane_b32 v63, s96, 32
 ; SI-NEXT:    v_writelane_b32 v63, s97, 33
 ; SI-NEXT:    v_writelane_b32 v63, s98, 34
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v21, v5
 ; SI-NEXT:    v_writelane_b32 v63, s99, 35
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, v26
 ; SI-NEXT:    v_readfirstlane_b32 s15, v16
-; SI-NEXT:    v_readfirstlane_b32 s21, v25
+; SI-NEXT:    v_readfirstlane_b32 s18, v25
 ; SI-NEXT:    v_readfirstlane_b32 s43, v15
 ; SI-NEXT:    v_readfirstlane_b32 s42, v24
 ; SI-NEXT:    v_readfirstlane_b32 s44, v23
+; SI-NEXT:    v_readfirstlane_b32 s49, v12
+; SI-NEXT:    v_readfirstlane_b32 s8, v11
+; SI-NEXT:    v_readfirstlane_b32 s53, v20
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_readfirstlane_b32 s4, v33
-; SI-NEXT:    v_writelane_b32 v62, s4, 11
 ; SI-NEXT:    v_readfirstlane_b32 s4, v34
-; SI-NEXT:    v_writelane_b32 v62, s4, 12
-; SI-NEXT:    v_readfirstlane_b32 s52, v37
-; SI-NEXT:    v_readfirstlane_b32 s82, v48
-; SI-NEXT:    v_readfirstlane_b32 s4, v53
-; SI-NEXT:    v_readfirstlane_b32 s79, v50
-; SI-NEXT:    v_readfirstlane_b32 s88, v52
-; SI-NEXT:    v_writelane_b32 v62, s4, 13
-; SI-NEXT:    v_readfirstlane_b32 s77, v55
-; SI-NEXT:    v_readfirstlane_b32 s4, v41
-; SI-NEXT:    v_readfirstlane_b32 s35, v42
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:192
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:180
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:176
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:172
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:168
+; SI-NEXT:    v_writelane_b32 v62, s4, 8
+; SI-NEXT:    v_readfirstlane_b32 s4, v38
+; SI-NEXT:    v_writelane_b32 v62, s4, 9
+; SI-NEXT:    v_readfirstlane_b32 s4, v49
+; SI-NEXT:    v_writelane_b32 v62, s4, 10
+; SI-NEXT:    v_readfirstlane_b32 s4, v50
+; SI-NEXT:    v_writelane_b32 v62, s4, 11
+; SI-NEXT:    v_readfirstlane_b32 s79, v52
+; SI-NEXT:    v_readfirstlane_b32 s88, v54
+; SI-NEXT:    v_readfirstlane_b32 s4, v55
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:192
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:180
+; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:176
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:172
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:168
 ; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:164
 ; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:160
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:148
-; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:144
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:140
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:136
-; SI-NEXT:    v_readfirstlane_b32 s16, v31
-; SI-NEXT:    v_readfirstlane_b32 s26, v32
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:148
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:144
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:140
+; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:136
+; SI-NEXT:    v_writelane_b32 v62, s4, 12
+; SI-NEXT:    v_readfirstlane_b32 s77, v41
+; SI-NEXT:    v_readfirstlane_b32 s4, v42
+; SI-NEXT:    v_readfirstlane_b32 s94, v31
+; SI-NEXT:    v_readfirstlane_b32 s70, v32
+; SI-NEXT:    v_readfirstlane_b32 s51, v33
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_readfirstlane_b32 s76, v45
-; SI-NEXT:    v_readfirstlane_b32 s66, v56
+; SI-NEXT:    v_readfirstlane_b32 s37, v45
+; SI-NEXT:    v_readfirstlane_b32 s24, v56
 ; SI-NEXT:    v_readfirstlane_b32 s7, v57
 ; SI-NEXT:    v_readfirstlane_b32 s92, v58
-; SI-NEXT:    v_readfirstlane_b32 s27, v59
+; SI-NEXT:    v_readfirstlane_b32 s28, v59
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:132
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:116
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:112
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:108
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:104
-; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:100
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:116
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:112
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:108
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:104
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:100
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:96
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:84
 ; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:80
-; SI-NEXT:    v_readfirstlane_b32 s51, v44
-; SI-NEXT:    v_readfirstlane_b32 s55, v47
-; SI-NEXT:    v_readfirstlane_b32 s6, v35
-; SI-NEXT:    v_readfirstlane_b32 s98, v36
-; SI-NEXT:    v_readfirstlane_b32 s18, v38
+; SI-NEXT:    v_readfirstlane_b32 s35, v43
+; SI-NEXT:    v_readfirstlane_b32 s55, v46
+; SI-NEXT:    v_readfirstlane_b32 s68, v35
+; SI-NEXT:    v_readfirstlane_b32 s87, v37
 ; SI-NEXT:    v_readfirstlane_b32 s67, v39
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:76
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:316
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_readfirstlane_b32 s34, v54
-; SI-NEXT:    v_readfirstlane_b32 s69, v51
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:68
+; SI-NEXT:    v_readfirstlane_b32 s74, v53
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:68
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:52
-; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:44
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:40
-; SI-NEXT:    v_readfirstlane_b32 s87, v40
-; SI-NEXT:    v_readfirstlane_b32 s86, v49
-; SI-NEXT:    v_writelane_b32 v62, s4, 14
-; SI-NEXT:    v_writelane_b32 v62, s17, 15
-; SI-NEXT:    v_writelane_b32 v62, s15, 16
-; SI-NEXT:    v_writelane_b32 v62, s21, 17
-; SI-NEXT:    v_writelane_b32 v62, s43, 18
-; SI-NEXT:    v_writelane_b32 v62, s42, 19
-; SI-NEXT:    v_writelane_b32 v62, s44, 20
-; SI-NEXT:    v_readfirstlane_b32 s53, v12
-; SI-NEXT:    v_readfirstlane_b32 s23, v11
-; SI-NEXT:    v_readfirstlane_b32 s8, v20
-; SI-NEXT:    v_readfirstlane_b32 s48, v19
-; SI-NEXT:    v_readfirstlane_b32 s63, v28
-; SI-NEXT:    v_readfirstlane_b32 s95, v27
-; SI-NEXT:    v_mov_b32_e32 v29, v13
-; SI-NEXT:    v_readfirstlane_b32 s97, v26
+; SI-NEXT:    v_readfirstlane_b32 s85, v48
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:48
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:44
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:40
+; SI-NEXT:    v_writelane_b32 v62, s4, 13
+; SI-NEXT:    v_readfirstlane_b32 s98, v40
+; SI-NEXT:    v_readfirstlane_b32 s69, v51
+; SI-NEXT:    v_readfirstlane_b32 s21, v36
+; SI-NEXT:    v_readfirstlane_b32 s40, v19
+; SI-NEXT:    v_readfirstlane_b32 s23, v28
+; SI-NEXT:    v_readfirstlane_b32 s34, v27
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v21, v13
+; SI-NEXT:    v_mov_b32_e32 v13, v5
+; SI-NEXT:    v_readfirstlane_b32 s97, v29
 ; SI-NEXT:    v_readfirstlane_b32 s80, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v14
 ; SI-NEXT:    v_lshlrev_b32_e32 v18, 24, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 24, v30
 ; SI-NEXT:    v_readfirstlane_b32 s96, v17
-; SI-NEXT:    v_readfirstlane_b32 s65, v10
 ; SI-NEXT:    v_readfirstlane_b32 s64, v9
-; SI-NEXT:    v_readfirstlane_b32 s68, v8
+; SI-NEXT:    v_readfirstlane_b32 s25, v8
 ; SI-NEXT:    v_readfirstlane_b32 s83, v7
 ; SI-NEXT:    v_readfirstlane_b32 s84, v4
 ; SI-NEXT:    v_readfirstlane_b32 s93, v3
-; SI-NEXT:    v_readfirstlane_b32 s90, v2
-; SI-NEXT:    v_readfirstlane_b32 s11, v1
-; SI-NEXT:    v_readfirstlane_b32 s59, v37
-; SI-NEXT:    v_readfirstlane_b32 s94, v50
-; SI-NEXT:    v_readfirstlane_b32 s39, v53
-; SI-NEXT:    v_readfirstlane_b32 s81, v43
-; SI-NEXT:    v_readfirstlane_b32 s71, v46
-; SI-NEXT:    v_readfirstlane_b32 s85, v60
-; SI-NEXT:    v_readfirstlane_b32 s89, v61
-; SI-NEXT:    v_readfirstlane_b32 s49, v33
-; SI-NEXT:    v_readfirstlane_b32 s70, v34
+; SI-NEXT:    v_readfirstlane_b32 s76, v1
+; SI-NEXT:    v_readfirstlane_b32 s58, v38
+; SI-NEXT:    v_readfirstlane_b32 s65, v49
+; SI-NEXT:    v_readfirstlane_b32 s62, v54
+; SI-NEXT:    v_readfirstlane_b32 s81, v44
+; SI-NEXT:    v_readfirstlane_b32 s71, v47
+; SI-NEXT:    v_readfirstlane_b32 s38, v60
+; SI-NEXT:    v_readfirstlane_b32 s86, v61
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:156
+; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:220
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_readfirstlane_b32 s74, v31
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:220
-; SI-NEXT:    v_readfirstlane_b32 s91, v52
+; SI-NEXT:    v_readfirstlane_b32 s90, v50
+; SI-NEXT:    v_readfirstlane_b32 s31, v52
+; SI-NEXT:    v_readfirstlane_b32 s4, v55
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:36
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:12
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:336
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:4
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32
-; SI-NEXT:    v_readfirstlane_b32 s37, v56
-; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_readfirstlane_b32 s38, v59
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:156
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:336
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32
+; SI-NEXT:    v_readfirstlane_b32 s72, v31
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:316
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:72
-; SI-NEXT:    v_readfirstlane_b32 s25, v57
-; SI-NEXT:    v_readfirstlane_b32 s56, v58
-; SI-NEXT:    v_readfirstlane_b32 s57, v55
-; SI-NEXT:    v_readfirstlane_b32 s58, v41
+; SI-NEXT:    v_readfirstlane_b32 s82, v56
+; SI-NEXT:    v_readfirstlane_b32 s95, v57
+; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    v_readfirstlane_b32 s39, v58
+; SI-NEXT:    v_readfirstlane_b32 s56, v59
+; SI-NEXT:    v_readfirstlane_b32 s57, v41
 ; SI-NEXT:    v_readfirstlane_b32 s36, v42
-; SI-NEXT:    v_readfirstlane_b32 s40, v45
-; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:284
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:252
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:188
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:124
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:28
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:60
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:92
-; SI-NEXT:    v_readfirstlane_b32 s75, v32
+; SI-NEXT:    v_readfirstlane_b32 s73, v45
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:284
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:252
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:188
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:124
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:60
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:92
+; SI-NEXT:    v_readfirstlane_b32 s16, v34
+; SI-NEXT:    v_readfirstlane_b32 s48, v32
+; SI-NEXT:    v_readfirstlane_b32 s52, v33
+; SI-NEXT:    v_writelane_b32 v62, s4, 14
 ; SI-NEXT:    v_readfirstlane_b32 s47, v35
-; SI-NEXT:    v_writelane_b32 v62, s56, 21
-; SI-NEXT:    v_writelane_b32 v62, s49, 22
-; SI-NEXT:    v_readfirstlane_b32 s72, v38
-; SI-NEXT:    v_readfirstlane_b32 s73, v39
+; SI-NEXT:    v_readfirstlane_b32 s60, v37
+; SI-NEXT:    v_readfirstlane_b32 s61, v39
+; SI-NEXT:    v_readfirstlane_b32 s89, v43
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_readfirstlane_b32 s22, v44
-; SI-NEXT:    v_readfirstlane_b32 s99, v47
-; SI-NEXT:    v_writelane_b32 v62, s53, 23
-; SI-NEXT:    v_writelane_b32 v62, s70, 24
-; SI-NEXT:    v_writelane_b32 v62, s23, 25
-; SI-NEXT:    v_writelane_b32 v62, s57, 26
-; SI-NEXT:    v_readfirstlane_b32 s54, v51
-; SI-NEXT:    v_readfirstlane_b32 s50, v54
-; SI-NEXT:    v_readfirstlane_b32 s31, v48
-; SI-NEXT:    v_readfirstlane_b32 s78, v49
-; SI-NEXT:    v_readfirstlane_b32 s30, v50
-; SI-NEXT:    v_readfirstlane_b32 s24, v53
-; SI-NEXT:    v_readfirstlane_b32 s28, v40
-; SI-NEXT:    s_waitcnt vmcnt(13)
-; SI-NEXT:    v_readfirstlane_b32 s20, v43
-; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v52
-; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_readfirstlane_b32 s45, v46
-; SI-NEXT:    v_writelane_b32 v62, s45, 27
-; SI-NEXT:    v_writelane_b32 v62, s8, 28
-; SI-NEXT:    v_writelane_b32 v62, s58, 29
-; SI-NEXT:    v_writelane_b32 v62, s59, 30
-; SI-NEXT:    v_writelane_b32 v62, s47, 31
-; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_readfirstlane_b32 s60, v36
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:312
+; SI-NEXT:    v_readfirstlane_b32 s99, v46
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:312
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:280
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:248
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:216
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:184
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:152
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:120
-; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:88
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:24
-; SI-NEXT:    v_writelane_b32 v62, s48, 32
-; SI-NEXT:    v_writelane_b32 v62, s26, 33
-; SI-NEXT:    v_readfirstlane_b32 s46, v60
-; SI-NEXT:    v_writelane_b32 v62, s60, 34
-; SI-NEXT:    v_readfirstlane_b32 s61, v61
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:184
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:152
+; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:120
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:88
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:24
+; SI-NEXT:    v_readfirstlane_b32 s54, v48
+; SI-NEXT:    v_readfirstlane_b32 s50, v53
+; SI-NEXT:    v_readfirstlane_b32 s78, v49
+; SI-NEXT:    v_readfirstlane_b32 s30, v51
+; SI-NEXT:    v_readfirstlane_b32 s66, v54
+; SI-NEXT:    v_readfirstlane_b32 s91, v40
+; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    v_readfirstlane_b32 s6, v44
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v50
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s4, v10
+; SI-NEXT:    v_writelane_b32 v62, s4, 15
+; SI-NEXT:    v_readfirstlane_b32 s4, v2
+; SI-NEXT:    v_writelane_b32 v62, s4, 16
+; SI-NEXT:    v_writelane_b32 v62, s17, 17
+; SI-NEXT:    v_writelane_b32 v62, s15, 18
+; SI-NEXT:    v_writelane_b32 v62, s18, 19
+; SI-NEXT:    v_writelane_b32 v62, s43, 20
+; SI-NEXT:    v_writelane_b32 v62, s42, 21
+; SI-NEXT:    v_writelane_b32 v62, s44, 22
+; SI-NEXT:    v_writelane_b32 v62, s16, 23
+; SI-NEXT:    v_writelane_b32 v62, s49, 24
+; SI-NEXT:    v_writelane_b32 v62, s8, 25
+; SI-NEXT:    v_writelane_b32 v62, s6, 26
+; SI-NEXT:    v_readfirstlane_b32 s45, v52
+; SI-NEXT:    v_writelane_b32 v62, s56, 27
+; SI-NEXT:    v_writelane_b32 v62, s45, 28
+; SI-NEXT:    v_writelane_b32 v62, s53, 29
+; SI-NEXT:    v_writelane_b32 v62, s94, 30
+; SI-NEXT:    v_writelane_b32 v62, s57, 31
+; SI-NEXT:    v_writelane_b32 v62, s58, 32
+; SI-NEXT:    v_writelane_b32 v62, s47, 33
+; SI-NEXT:    v_readfirstlane_b32 s46, v55
+; SI-NEXT:    v_writelane_b32 v62, s40, 34
+; SI-NEXT:    v_readfirstlane_b32 s59, v47
 ; SI-NEXT:    v_writelane_b32 v62, s46, 35
-; SI-NEXT:    v_writelane_b32 v62, s61, 36
-; SI-NEXT:    v_writelane_b32 v62, s72, 37
+; SI-NEXT:    v_writelane_b32 v62, s59, 36
+; SI-NEXT:    v_writelane_b32 v62, s60, 37
 ; SI-NEXT:    v_writelane_b32 v62, s36, 38
-; SI-NEXT:    v_writelane_b32 v62, s94, 39
-; SI-NEXT:    v_writelane_b32 v62, s73, 40
-; SI-NEXT:    v_writelane_b32 v62, s40, 41
-; SI-NEXT:    v_writelane_b32 v62, s39, 42
-; SI-NEXT:    v_writelane_b32 v62, s74, 43
-; SI-NEXT:    v_writelane_b32 v62, s63, 44
-; SI-NEXT:    v_writelane_b32 v62, s75, 45
-; SI-NEXT:    v_writelane_b32 v62, s95, 46
+; SI-NEXT:    v_writelane_b32 v62, s65, 39
+; SI-NEXT:    v_writelane_b32 v62, s61, 40
+; SI-NEXT:    v_writelane_b32 v62, s73, 41
+; SI-NEXT:    v_writelane_b32 v62, s62, 42
+; SI-NEXT:    v_writelane_b32 v62, s72, 43
+; SI-NEXT:    v_writelane_b32 v62, s23, 44
+; SI-NEXT:    v_writelane_b32 v62, s48, 45
+; SI-NEXT:    v_writelane_b32 v62, s34, 46
 ; SI-NEXT:    v_writelane_b32 v62, s78, 47
 ; SI-NEXT:    v_writelane_b32 v62, s30, 48
 ; SI-NEXT:    v_writelane_b32 v62, s54, 49
 ; SI-NEXT:    v_writelane_b32 v62, s50, 50
-; SI-NEXT:    v_writelane_b32 v62, s25, 51
-; SI-NEXT:    v_writelane_b32 v62, s24, 52
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 24, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v61, 24, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v30
+; SI-NEXT:    v_writelane_b32 v62, s52, 51
+; SI-NEXT:    v_writelane_b32 v62, s82, 52
+; SI-NEXT:    v_writelane_b32 v62, s66, 53
+; SI-NEXT:    v_readfirstlane_b32 s22, v36
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v56
-; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v22, 24, v57
-; SI-NEXT:    s_waitcnt vmcnt(10)
+; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_lshlrev_b32_e32 v23, 24, v58
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v45
-; SI-NEXT:    v_lshlrev_b32_e32 v43, 24, v59
-; SI-NEXT:    v_lshlrev_b32_e32 v45, 24, v42
-; SI-NEXT:    v_lshlrev_b32_e32 v46, 24, v31
-; SI-NEXT:    v_lshlrev_b32_e32 v56, 24, v41
-; SI-NEXT:    v_lshlrev_b32_e32 v57, 24, v55
-; SI-NEXT:    v_lshlrev_b32_e32 v58, 24, v5
-; SI-NEXT:    v_writelane_b32 v62, s28, 53
+; SI-NEXT:    s_waitcnt vmcnt(10)
+; SI-NEXT:    v_lshlrev_b32_e32 v59, 24, v59
+; SI-NEXT:    v_lshlrev_b32_e32 v24, 24, v56
+; SI-NEXT:    v_lshlrev_b32_e32 v44, 24, v60
+; SI-NEXT:    v_lshlrev_b32_e32 v45, 24, v45
+; SI-NEXT:    v_lshlrev_b32_e32 v47, 24, v61
+; SI-NEXT:    v_lshlrev_b32_e32 v56, 24, v42
+; SI-NEXT:    v_lshlrev_b32_e32 v57, 24, v41
+; SI-NEXT:    v_lshlrev_b32_e32 v58, 24, v31
+; SI-NEXT:    v_writelane_b32 v62, s91, 54
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB97_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; SI-NEXT:    v_readlane_b32 s4, v62, 8
-; SI-NEXT:    v_readlane_b32 s5, v62, 7
-; SI-NEXT:    v_mov_b32_e32 v13, v21
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v21
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v5, v13
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v13
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT:    v_readlane_b32 s5, v62, 5
+; SI-NEXT:    s_and_b32 s4, s20, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_writelane_b32 v62, s4, 54
-; SI-NEXT:    v_readlane_b32 s4, v62, 6
+; SI-NEXT:    v_writelane_b32 v62, s4, 55
+; SI-NEXT:    v_readlane_b32 s4, v62, 4
 ; SI-NEXT:    s_and_b32 s4, s4, 0xff
-; SI-NEXT:    v_readlane_b32 s5, v62, 5
+; SI-NEXT:    v_readlane_b32 s5, v62, 3
 ; SI-NEXT:    s_lshl_b32 s4, s4, 16
 ; SI-NEXT:    s_lshl_b32 s5, s5, 24
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_writelane_b32 v62, s4, 55
-; SI-NEXT:    v_readlane_b32 s4, v62, 9
+; SI-NEXT:    s_or_b32 s63, s5, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 6
 ; SI-NEXT:    s_and_b32 s5, s4, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
 ; SI-NEXT:    s_lshl_b32 s9, s19, 24
@@ -209340,29 +209227,27 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s10, s29, 8
 ; SI-NEXT:    s_or_b32 s4, s5, s10
 ; SI-NEXT:    v_writelane_b32 v62, s4, 56
-; SI-NEXT:    v_writelane_b32 v62, s37, 57
-; SI-NEXT:    s_and_b32 s5, s11, 0xff
-; SI-NEXT:    s_mov_b32 s37, s11
+; SI-NEXT:    s_and_b32 s5, s76, 0xff
+; SI-NEXT:    v_readlane_b32 s10, v62, 16
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s11, s90, 24
-; SI-NEXT:    v_readlane_b32 s4, v62, 2
+; SI-NEXT:    s_lshl_b32 s11, s10, 24
 ; SI-NEXT:    s_or_b32 s5, s11, s5
-; SI-NEXT:    s_and_b32 s11, s4, 0xff
-; SI-NEXT:    v_readlane_b32 s4, v62, 1
+; SI-NEXT:    s_and_b32 s11, s26, 0xff
 ; SI-NEXT:    s_lshl_b32 s11, s11, 16
-; SI-NEXT:    s_lshl_b32 s12, s4, 24
+; SI-NEXT:    s_lshl_b32 s12, s27, 24
 ; SI-NEXT:    s_or_b32 s14, s12, s11
 ; SI-NEXT:    s_and_b32 s11, s83, 0xff
-; SI-NEXT:    s_lshl_b32 s12, s68, 8
-; SI-NEXT:    s_or_b32 s4, s11, s12
+; SI-NEXT:    s_lshl_b32 s12, s25, 8
+; SI-NEXT:    s_or_b32 s10, s11, s12
+; SI-NEXT:    v_writelane_b32 v62, s10, 57
 ; SI-NEXT:    s_and_b32 s11, s64, 0xff
+; SI-NEXT:    v_readlane_b32 s10, v62, 15
 ; SI-NEXT:    s_lshl_b32 s11, s11, 16
-; SI-NEXT:    s_lshl_b32 s13, s65, 24
+; SI-NEXT:    s_lshl_b32 s13, s10, 24
 ; SI-NEXT:    s_or_b32 s41, s13, s11
 ; SI-NEXT:    s_and_b32 s11, s43, 0xff
 ; SI-NEXT:    s_lshl_b32 s13, s15, 8
-; SI-NEXT:    v_writelane_b32 v62, s4, 58
-; SI-NEXT:    s_or_b32 s4, s11, s13
+; SI-NEXT:    s_or_b32 s10, s11, s13
 ; SI-NEXT:    s_and_b32 s11, s96, 0xff
 ; SI-NEXT:    s_lshl_b32 s11, s11, 16
 ; SI-NEXT:    s_lshl_b32 s15, s80, 24
@@ -209370,269 +209255,266 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_and_b32 s11, s44, 0xff
 ; SI-NEXT:    s_lshl_b32 s15, s42, 8
 ; SI-NEXT:    s_or_b32 s13, s11, s15
-; SI-NEXT:    s_and_b32 s11, s21, 0xff
+; SI-NEXT:    s_and_b32 s11, s18, 0xff
 ; SI-NEXT:    s_lshl_b32 s11, s11, 16
 ; SI-NEXT:    s_lshl_b32 s15, s97, 24
 ; SI-NEXT:    s_or_b32 s44, s15, s11
-; SI-NEXT:    s_and_b32 s11, s61, 0xff
+; SI-NEXT:    s_and_b32 s11, s59, 0xff
 ; SI-NEXT:    s_lshl_b32 s15, s46, 8
 ; SI-NEXT:    s_or_b32 s12, s11, s15
 ; SI-NEXT:    s_and_b32 s11, s45, 0xff
 ; SI-NEXT:    s_lshl_b32 s11, s11, 16
-; SI-NEXT:    s_lshl_b32 s15, s20, 24
+; SI-NEXT:    s_lshl_b32 s15, s6, 24
 ; SI-NEXT:    s_or_b32 s45, s15, s11
 ; SI-NEXT:    s_and_b32 s11, s30, 0xff
 ; SI-NEXT:    s_lshl_b32 s15, s78, 8
+; SI-NEXT:    v_writelane_b32 v62, s10, 58
 ; SI-NEXT:    s_or_b32 s10, s11, s15
 ; SI-NEXT:    s_and_b32 s11, s99, 0xff
 ; SI-NEXT:    s_lshl_b32 s11, s11, 16
-; SI-NEXT:    s_lshl_b32 s15, s22, 24
+; SI-NEXT:    s_lshl_b32 s15, s89, 24
 ; SI-NEXT:    s_or_b32 s46, s15, s11
-; SI-NEXT:    s_and_b32 s11, s73, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s72, 8
-; SI-NEXT:    v_writelane_b32 v62, s4, 59
-; SI-NEXT:    s_or_b32 s4, s11, s15
-; SI-NEXT:    s_and_b32 s11, s60, 0xff
+; SI-NEXT:    s_and_b32 s11, s61, 0xff
+; SI-NEXT:    s_lshl_b32 s15, s60, 8
+; SI-NEXT:    s_or_b32 s6, s11, s15
+; SI-NEXT:    s_and_b32 s11, s22, 0xff
 ; SI-NEXT:    s_lshl_b32 s11, s11, 16
 ; SI-NEXT:    s_lshl_b32 s15, s47, 24
 ; SI-NEXT:    s_or_b32 s47, s15, s11
-; SI-NEXT:    s_and_b32 s11, s58, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s57, 8
-; SI-NEXT:    s_mov_b32 s62, s16
-; SI-NEXT:    s_or_b32 s16, s11, s15
-; SI-NEXT:    s_and_b32 s11, s38, 0xff
+; SI-NEXT:    s_and_b32 s11, s57, 0xff
+; SI-NEXT:    s_lshl_b32 s15, s56, 8
+; SI-NEXT:    v_writelane_b32 v62, s6, 59
+; SI-NEXT:    s_or_b32 s6, s11, s15
+; SI-NEXT:    s_and_b32 s11, s39, 0xff
+; SI-NEXT:    v_writelane_b32 v62, s6, 60
 ; SI-NEXT:    s_lshl_b32 s11, s11, 16
-; SI-NEXT:    s_lshl_b32 s15, s56, 24
+; SI-NEXT:    s_lshl_b32 s15, s95, 24
 ; SI-NEXT:    s_or_b32 s56, s15, s11
-; SI-NEXT:    s_and_b32 s11, s75, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s74, 8
-; SI-NEXT:    s_or_b32 s73, s11, s15
-; SI-NEXT:    s_and_b32 s11, s91, 0xff
+; SI-NEXT:    s_and_b32 s11, s48, 0xff
+; SI-NEXT:    s_lshl_b32 s15, s72, 8
+; SI-NEXT:    v_readlane_b32 s6, v62, 14
+; SI-NEXT:    s_or_b32 s48, s11, s15
+; SI-NEXT:    s_and_b32 s11, s6, 0xff
 ; SI-NEXT:    s_lshl_b32 s11, s11, 16
 ; SI-NEXT:    s_lshl_b32 s15, s31, 24
 ; SI-NEXT:    s_or_b32 vcc_lo, s15, s11
-; SI-NEXT:    s_and_b32 s11, s89, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s85, 8
+; SI-NEXT:    s_and_b32 s11, s86, 0xff
+; SI-NEXT:    s_lshl_b32 s15, s38, 8
 ; SI-NEXT:    s_or_b32 s72, s11, s15
 ; SI-NEXT:    s_and_b32 s11, s71, 0xff
 ; SI-NEXT:    s_lshl_b32 s11, s11, 16
 ; SI-NEXT:    s_lshl_b32 s15, s81, 24
 ; SI-NEXT:    s_or_b32 vcc_hi, s15, s11
-; SI-NEXT:    s_and_b32 s11, s59, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s86, 8
+; SI-NEXT:    s_and_b32 s11, s58, 0xff
+; SI-NEXT:    s_lshl_b32 s15, s85, 8
 ; SI-NEXT:    s_or_b32 s57, s11, s15
 ; SI-NEXT:    s_and_b32 s11, s69, 0xff
 ; SI-NEXT:    s_lshl_b32 s11, s11, 16
-; SI-NEXT:    s_lshl_b32 s15, s34, 24
+; SI-NEXT:    s_lshl_b32 s15, s74, 24
+; SI-NEXT:    v_writelane_b32 v62, s74, 61
 ; SI-NEXT:    s_or_b32 s74, s15, s11
-; SI-NEXT:    s_and_b32 s11, s18, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s98, 8
+; SI-NEXT:    s_and_b32 s11, s87, 0xff
+; SI-NEXT:    s_lshl_b32 s15, s21, 8
 ; SI-NEXT:    s_or_b32 s58, s11, s15
-; SI-NEXT:    s_and_b32 s11, s6, 0xff
-; SI-NEXT:    v_writelane_b32 v62, s4, 60
+; SI-NEXT:    s_and_b32 s11, s68, 0xff
 ; SI-NEXT:    s_lshl_b32 s11, s11, 16
-; SI-NEXT:    s_lshl_b32 s15, s27, 24
-; SI-NEXT:    v_writelane_b32 v62, s34, 61
+; SI-NEXT:    s_lshl_b32 s15, s28, 24
 ; SI-NEXT:    s_or_b32 s75, s15, s11
-; SI-NEXT:    s_and_b32 s11, s66, 0xff
+; SI-NEXT:    s_and_b32 s11, s24, 0xff
 ; SI-NEXT:    s_lshl_b32 s15, s55, 8
-; SI-NEXT:    v_writelane_b32 v62, s6, 62
+; SI-NEXT:    v_writelane_b32 v62, s25, 62
 ; SI-NEXT:    s_or_b32 s59, s11, s15
-; SI-NEXT:    s_and_b32 s11, s76, 0xff
+; SI-NEXT:    s_and_b32 s11, s37, 0xff
 ; SI-NEXT:    s_lshl_b32 s11, s11, 16
 ; SI-NEXT:    s_lshl_b32 s15, s51, 24
-; SI-NEXT:    v_writelane_b32 v62, s85, 63
-; SI-NEXT:    s_mov_b32 s4, s97
-; SI-NEXT:    s_mov_b32 s97, s76
+; SI-NEXT:    v_readlane_b32 s4, v62, 13
+; SI-NEXT:    s_mov_b32 s18, s21
+; SI-NEXT:    s_mov_b32 s21, s97
+; SI-NEXT:    s_mov_b32 s97, s37
+; SI-NEXT:    s_mov_b32 s37, s76
 ; SI-NEXT:    s_or_b32 s76, s15, s11
-; SI-NEXT:    v_readlane_b32 s15, v62, 14
 ; SI-NEXT:    s_and_b32 s11, s35, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s15, 8
+; SI-NEXT:    s_lshl_b32 s15, s4, 8
 ; SI-NEXT:    s_or_b32 s60, s11, s15
 ; SI-NEXT:    s_and_b32 s11, s77, 0xff
-; SI-NEXT:    v_readlane_b32 s15, v62, 13
+; SI-NEXT:    v_readlane_b32 s4, v62, 12
 ; SI-NEXT:    s_lshl_b32 s11, s11, 16
-; SI-NEXT:    s_lshl_b32 s15, s15, 24
-; SI-NEXT:    s_mov_b32 s21, s20
-; SI-NEXT:    s_mov_b32 s20, s38
-; SI-NEXT:    s_mov_b32 s38, s99
+; SI-NEXT:    s_lshl_b32 s15, s4, 24
+; SI-NEXT:    v_readlane_b32 s4, v62, 11
+; SI-NEXT:    s_mov_b32 s6, s95
+; SI-NEXT:    s_mov_b32 s95, s39
+; SI-NEXT:    s_mov_b32 s39, s89
+; SI-NEXT:    s_mov_b32 s89, s99
 ; SI-NEXT:    s_mov_b32 s99, s83
 ; SI-NEXT:    s_mov_b32 s83, s55
 ; SI-NEXT:    s_mov_b32 s55, s64
 ; SI-NEXT:    s_mov_b32 s64, s35
 ; SI-NEXT:    s_mov_b32 s35, s77
 ; SI-NEXT:    s_or_b32 s77, s15, s11
-; SI-NEXT:    s_and_b32 s11, s82, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s52, 8
+; SI-NEXT:    s_and_b32 s11, s4, 0xff
+; SI-NEXT:    v_readlane_b32 s4, v62, 10
+; SI-NEXT:    s_lshl_b32 s15, s4, 8
+; SI-NEXT:    v_readlane_b32 s4, v62, 9
 ; SI-NEXT:    s_or_b32 s61, s11, s15
-; SI-NEXT:    v_readlane_b32 s11, v62, 12
-; SI-NEXT:    s_and_b32 s11, s11, 0xff
-; SI-NEXT:    v_readlane_b32 s15, v62, 11
+; SI-NEXT:    s_and_b32 s11, s4, 0xff
+; SI-NEXT:    v_readlane_b32 s4, v62, 8
 ; SI-NEXT:    s_lshl_b32 s11, s11, 16
-; SI-NEXT:    s_lshl_b32 s15, s15, 24
+; SI-NEXT:    s_lshl_b32 s15, s4, 24
 ; SI-NEXT:    s_or_b32 s78, s15, s11
-; SI-NEXT:    v_readlane_b32 s11, v62, 10
+; SI-NEXT:    v_readlane_b32 s11, v62, 7
 ; SI-NEXT:    s_and_b32 s11, s11, 0xff
 ; SI-NEXT:    s_lshl_b32 s15, s17, 8
 ; SI-NEXT:    s_or_b32 s11, s11, s15
 ; SI-NEXT:    s_and_b32 s11, s11, 0xffff
-; SI-NEXT:    s_mov_b32 s30, s18
-; SI-NEXT:    s_mov_b32 s18, s89
-; SI-NEXT:    s_mov_b32 s89, s98
-; SI-NEXT:    s_mov_b32 s98, s96
-; SI-NEXT:    s_mov_b32 s96, s66
-; SI-NEXT:    s_mov_b32 s66, s82
-; SI-NEXT:    s_mov_b32 s82, s52
 ; SI-NEXT:    v_mov_b32_e32 v51, s9
-; SI-NEXT:    s_or_b32 s52, s11, s9
-; SI-NEXT:    v_readlane_b32 s9, v62, 4
-; SI-NEXT:    v_readlane_b32 s11, v62, 3
+; SI-NEXT:    s_or_b32 s17, s11, s9
+; SI-NEXT:    v_readlane_b32 s9, v62, 2
+; SI-NEXT:    v_readlane_b32 s11, v62, 1
 ; SI-NEXT:    s_and_b32 s9, s9, 0xff
 ; SI-NEXT:    s_lshl_b32 s15, s11, 8
 ; SI-NEXT:    s_or_b32 s9, s9, s15
 ; SI-NEXT:    s_and_b32 s9, s9, 0xffff
+; SI-NEXT:    s_mov_b32 s4, s96
+; SI-NEXT:    s_mov_b32 s96, s24
 ; SI-NEXT:    v_mov_b32_e32 v52, s14
-; SI-NEXT:    s_or_b32 s17, s9, s14
+; SI-NEXT:    s_or_b32 s24, s9, s14
 ; SI-NEXT:    s_and_b32 s14, s93, 0xff
 ; SI-NEXT:    s_lshl_b32 s15, s84, 8
 ; SI-NEXT:    s_or_b32 s14, s14, s15
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v53, v60, v1
+; SI-NEXT:    v_or_b32_e32 v53, v6, v1
 ; SI-NEXT:    s_and_b32 s14, s14, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v50, s14, v53
-; SI-NEXT:    s_and_b32 s14, s23, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s53, 8
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v29
+; SI-NEXT:    s_and_b32 s14, s8, 0xff
+; SI-NEXT:    s_lshl_b32 s15, s49, 8
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v21
 ; SI-NEXT:    s_or_b32 s14, s14, s15
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v54, v61, v1
+; SI-NEXT:    v_or_b32_e32 v54, v14, v1
 ; SI-NEXT:    s_and_b32 s14, s14, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v17, s14, v54
-; SI-NEXT:    s_and_b32 s14, s48, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s8, 8
+; SI-NEXT:    s_and_b32 s14, s40, 0xff
+; SI-NEXT:    s_lshl_b32 s15, s53, 8
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v19
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v29
 ; SI-NEXT:    s_or_b32 s14, s14, s15
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v55, v18, v1
 ; SI-NEXT:    s_and_b32 s14, s14, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v16, s14, v55
-; SI-NEXT:    s_and_b32 s14, s95, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s63, 8
+; SI-NEXT:    s_and_b32 s14, s34, 0xff
+; SI-NEXT:    s_lshl_b32 s15, s23, 8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v21
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v13
 ; SI-NEXT:    s_or_b32 s14, s14, s15
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v40, v6, v1
+; SI-NEXT:    v_or_b32_e32 v40, v19, v1
 ; SI-NEXT:    s_and_b32 s14, s14, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v15, s14, v40
-; SI-NEXT:    s_and_b32 s14, s28, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s24, 8
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v44
+; SI-NEXT:    s_and_b32 s14, s91, 0xff
+; SI-NEXT:    s_lshl_b32 s15, s66, 8
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v43
 ; SI-NEXT:    s_or_b32 s14, s14, s15
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v41, v20, v1
+; SI-NEXT:    v_or_b32_e32 v41, v22, v1
 ; SI-NEXT:    s_and_b32 s14, s14, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v12, s14, v41
 ; SI-NEXT:    s_and_b32 s14, s50, 0xff
 ; SI-NEXT:    s_lshl_b32 s15, s54, 8
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v47
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v32
 ; SI-NEXT:    s_or_b32 s14, s14, s15
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v42, v22, v1
+; SI-NEXT:    v_or_b32_e32 v42, v23, v1
 ; SI-NEXT:    s_and_b32 s14, s14, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v11, s14, v42
-; SI-NEXT:    s_and_b32 s14, s40, 0xff
+; SI-NEXT:    s_and_b32 s14, s73, 0xff
 ; SI-NEXT:    s_lshl_b32 s15, s36, 8
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v34
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v46
 ; SI-NEXT:    s_or_b32 s14, s14, s15
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v59, v23, v1
+; SI-NEXT:    v_or_b32_e32 v28, v59, v1
 ; SI-NEXT:    s_and_b32 s14, s14, 0xffff
-; SI-NEXT:    v_readlane_b32 s8, v62, 57
-; SI-NEXT:    v_or_b32_e32 v10, s14, v59
-; SI-NEXT:    s_and_b32 s14, s25, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s8, 8
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v35
+; SI-NEXT:    v_or_b32_e32 v10, s14, v28
+; SI-NEXT:    s_and_b32 s14, s82, 0xff
+; SI-NEXT:    s_lshl_b32 s15, s52, 8
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v26
 ; SI-NEXT:    s_or_b32 s14, s14, s15
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mov_b32_e32 v25, v20
-; SI-NEXT:    v_mov_b32_e32 v20, v60
-; SI-NEXT:    v_or_b32_e32 v60, v14, v1
+; SI-NEXT:    v_or_b32_e32 v60, v24, v1
 ; SI-NEXT:    s_and_b32 s14, s14, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v9, s14, v60
-; SI-NEXT:    s_and_b32 s14, s70, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s49, 8
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v36
+; SI-NEXT:    s_and_b32 s14, s90, 0xff
+; SI-NEXT:    s_lshl_b32 s15, s16, 8
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v35
 ; SI-NEXT:    s_or_b32 s14, s14, s15
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v5, v43, v1
+; SI-NEXT:    v_or_b32_e32 v31, v44, v1
 ; SI-NEXT:    s_and_b32 s14, s14, 0xffff
-; SI-NEXT:    v_or_b32_e32 v8, s14, v5
-; SI-NEXT:    s_and_b32 s14, s39, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s94, 8
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v32
+; SI-NEXT:    v_or_b32_e32 v8, s14, v31
+; SI-NEXT:    s_and_b32 s14, s62, 0xff
+; SI-NEXT:    s_lshl_b32 s15, s65, 8
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v37
 ; SI-NEXT:    s_or_b32 s14, s14, s15
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v31, v45, v1
+; SI-NEXT:    v_or_b32_e32 v61, v45, v1
 ; SI-NEXT:    s_and_b32 s14, s14, 0xffff
-; SI-NEXT:    v_or_b32_e32 v7, s14, v31
-; SI-NEXT:    s_and_b32 s14, s87, 0xff
+; SI-NEXT:    v_or_b32_e32 v7, s14, v61
+; SI-NEXT:    s_and_b32 s14, s98, 0xff
 ; SI-NEXT:    s_lshl_b32 s15, s67, 8
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v38
 ; SI-NEXT:    s_or_b32 s14, s14, s15
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mov_b32_e32 v26, v34
-; SI-NEXT:    v_mov_b32_e32 v34, v22
-; SI-NEXT:    v_mov_b32_e32 v22, v61
-; SI-NEXT:    v_or_b32_e32 v61, v46, v1
+; SI-NEXT:    s_waitcnt expcnt(5)
+; SI-NEXT:    v_or_b32_e32 v6, v47, v1
 ; SI-NEXT:    s_and_b32 s14, s14, 0xffff
-; SI-NEXT:    v_or_b32_e32 v4, s14, v61
+; SI-NEXT:    v_or_b32_e32 v4, s14, v6
 ; SI-NEXT:    s_and_b32 s14, s92, 0xff
 ; SI-NEXT:    s_lshl_b32 s15, s7, 8
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v33
 ; SI-NEXT:    s_or_b32 s14, s14, s15
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_readlane_b32 s8, v62, 54
+; SI-NEXT:    v_readlane_b32 s8, v62, 55
 ; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_or_b32_e32 v6, v56, v1
+; SI-NEXT:    v_mov_b32_e32 v22, v14
+; SI-NEXT:    v_or_b32_e32 v14, v56, v1
 ; SI-NEXT:    s_and_b32 s14, s14, 0xffff
 ; SI-NEXT:    s_and_b32 s8, s8, 0xffff
-; SI-NEXT:    v_readlane_b32 s11, v62, 55
-; SI-NEXT:    v_mov_b32_e32 v28, v36
-; SI-NEXT:    v_or_b32_e32 v36, s14, v6
-; SI-NEXT:    s_and_b32 s14, s26, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s62, 8
+; SI-NEXT:    v_or_b32_e32 v2, s14, v14
+; SI-NEXT:    s_and_b32 s14, s70, 0xff
+; SI-NEXT:    s_lshl_b32 s15, s94, 8
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v39
-; SI-NEXT:    s_or_b32 s42, s8, s11
+; SI-NEXT:    s_or_b32 s42, s8, s63
 ; SI-NEXT:    v_readlane_b32 s8, v62, 56
 ; SI-NEXT:    s_or_b32 s14, s14, s15
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_and_b32 s8, s8, 0xffff
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_or_b32_e32 v14, v57, v1
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_mov_b32_e32 v32, v23
+; SI-NEXT:    v_mov_b32_e32 v23, v18
+; SI-NEXT:    v_or_b32_e32 v18, v57, v1
 ; SI-NEXT:    s_and_b32 s14, s14, 0xffff
 ; SI-NEXT:    s_or_b32 s40, s8, s5
-; SI-NEXT:    v_readlane_b32 s8, v62, 58
-; SI-NEXT:    v_or_b32_e32 v1, s14, v14
+; SI-NEXT:    v_readlane_b32 s8, v62, 57
+; SI-NEXT:    v_or_b32_e32 v1, s14, v18
 ; SI-NEXT:    s_and_b32 s14, s88, 0xff
 ; SI-NEXT:    s_lshl_b32 s15, s79, 8
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v37
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v34
 ; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    v_readlane_b32 s9, v62, 60
 ; SI-NEXT:    s_or_b32 s14, s14, s15
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    s_or_b32 s15, s8, s41
-; SI-NEXT:    v_readlane_b32 s8, v62, 59
-; SI-NEXT:    s_and_b32 s16, s16, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v27, v35
-; SI-NEXT:    v_mov_b32_e32 v35, v23
-; SI-NEXT:    v_mov_b32_e32 v23, v18
-; SI-NEXT:    v_or_b32_e32 v18, v58, v3
+; SI-NEXT:    v_readlane_b32 s8, v62, 58
+; SI-NEXT:    s_and_b32 s16, s9, 0xffff
+; SI-NEXT:    v_mov_b32_e32 v27, v26
+; SI-NEXT:    v_mov_b32_e32 v26, v24
+; SI-NEXT:    v_mov_b32_e32 v24, v19
+; SI-NEXT:    v_or_b32_e32 v19, v58, v3
 ; SI-NEXT:    s_and_b32 s14, s14, 0xffff
 ; SI-NEXT:    s_and_b32 s8, s8, 0xffff
 ; SI-NEXT:    s_or_b32 s36, s16, s56
-; SI-NEXT:    s_and_b32 s16, s73, 0xffff
-; SI-NEXT:    v_or_b32_e32 v3, s14, v18
+; SI-NEXT:    s_and_b32 s16, s48, 0xffff
+; SI-NEXT:    v_or_b32_e32 v3, s14, v19
 ; SI-NEXT:    s_or_b32 s14, s8, s43
 ; SI-NEXT:    s_and_b32 s8, s13, 0xffff
 ; SI-NEXT:    s_or_b32 s53, s16, vcc_lo
@@ -209646,46 +209528,49 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s49, s16, s74
 ; SI-NEXT:    s_and_b32 s16, s58, 0xffff
 ; SI-NEXT:    s_or_b32 s10, s8, s46
-; SI-NEXT:    v_readlane_b32 s8, v62, 60
+; SI-NEXT:    v_readlane_b32 s8, v62, 59
 ; SI-NEXT:    s_or_b32 s48, s16, s75
 ; SI-NEXT:    s_and_b32 s16, s59, 0xffff
 ; SI-NEXT:    s_and_b32 s8, s8, 0xffff
-; SI-NEXT:    s_or_b32 s39, s16, s76
+; SI-NEXT:    s_or_b32 s11, s16, s76
 ; SI-NEXT:    s_and_b32 s16, s60, 0xffff
 ; SI-NEXT:    s_and_b32 s23, s61, 0xffff
+; SI-NEXT:    s_mov_b32 s30, s87
+; SI-NEXT:    s_mov_b32 s87, s85
 ; SI-NEXT:    s_or_b32 s8, s8, s47
-; SI-NEXT:    s_or_b32 s70, s16, s77
-; SI-NEXT:    s_or_b32 s9, s23, s78
-; SI-NEXT:    v_mov_b32_e32 v24, v45
-; SI-NEXT:    v_mov_b32_e32 v48, v46
-; SI-NEXT:    v_mov_b32_e32 v30, v56
-; SI-NEXT:    v_mov_b32_e32 v49, v57
-; SI-NEXT:    v_mov_b32_e32 v2, v58
+; SI-NEXT:    s_or_b32 s9, s16, s77
+; SI-NEXT:    s_or_b32 s16, s23, s78
+; SI-NEXT:    v_mov_b32_e32 v36, v35
+; SI-NEXT:    v_mov_b32_e32 v30, v37
+; SI-NEXT:    v_mov_b32_e32 v35, v45
+; SI-NEXT:    v_mov_b32_e32 v20, v47
+; SI-NEXT:    v_mov_b32_e32 v49, v56
+; SI-NEXT:    v_mov_b32_e32 v48, v39
+; SI-NEXT:    v_mov_b32_e32 v39, v57
+; SI-NEXT:    v_mov_b32_e32 v25, v58
 ; SI-NEXT:    v_alignbit_b32 v57, s42, v51, 16
 ; SI-NEXT:    v_alignbit_b32 v58, s40, v52, 16
 ; SI-NEXT:    v_alignbit_b32 v56, s15, v53, 16
 ; SI-NEXT:    v_alignbit_b32 v47, s14, v54, 16
+; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    v_alignbit_b32 v46, s13, v55, 16
 ; SI-NEXT:    v_alignbit_b32 v45, s12, v40, 16
-; SI-NEXT:    v_alignbit_b32 v44, s10, v41, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v44, s10, v41, 16
 ; SI-NEXT:    v_alignbit_b32 v43, s8, v42, 16
-; SI-NEXT:    v_alignbit_b32 v42, s36, v59, 16
+; SI-NEXT:    v_alignbit_b32 v42, s36, v28, 16
 ; SI-NEXT:    v_alignbit_b32 v41, s53, v60, 16
-; SI-NEXT:    v_mov_b32_e32 v60, v20
-; SI-NEXT:    v_mov_b32_e32 v20, v25
-; SI-NEXT:    v_alignbit_b32 v40, s94, v5, 16
-; SI-NEXT:    v_alignbit_b32 v55, s49, v31, 16
-; SI-NEXT:    v_alignbit_b32 v54, s48, v61, 16
-; SI-NEXT:    v_mov_b32_e32 v61, v22
-; SI-NEXT:    v_mov_b32_e32 v22, v34
-; SI-NEXT:    v_alignbit_b32 v53, s39, v6, 16
-; SI-NEXT:    s_mov_b32 s16, s62
-; SI-NEXT:    v_alignbit_b32 v52, s70, v14, 16
-; SI-NEXT:    v_alignbit_b32 v51, s9, v18, 16
+; SI-NEXT:    v_alignbit_b32 v40, s94, v31, 16
+; SI-NEXT:    v_alignbit_b32 v55, s49, v61, 16
+; SI-NEXT:    v_alignbit_b32 v54, s48, v6, 16
+; SI-NEXT:    v_alignbit_b32 v53, s11, v14, 16
+; SI-NEXT:    v_mov_b32_e32 v14, v22
+; SI-NEXT:    v_alignbit_b32 v52, s9, v18, 16
 ; SI-NEXT:    v_mov_b32_e32 v18, v23
-; SI-NEXT:    v_mov_b32_e32 v23, v35
-; SI-NEXT:    s_lshr_b32 s73, s11, 16
+; SI-NEXT:    v_alignbit_b32 v51, s16, v19, 16
+; SI-NEXT:    v_mov_b32_e32 v19, v24
+; SI-NEXT:    v_mov_b32_e32 v24, v26
+; SI-NEXT:    s_lshr_b32 s73, s63, 16
 ; SI-NEXT:    s_lshr_b32 s72, s5, 16
 ; SI-NEXT:    s_lshr_b32 s63, s41, 16
 ; SI-NEXT:    s_lshr_b32 s62, s43, 16
@@ -209697,85 +209582,87 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_lshr_b32 s56, vcc_lo, 16
 ; SI-NEXT:    s_lshr_b32 s47, vcc_hi, 16
 ; SI-NEXT:    s_lshr_b32 s46, s74, 16
-; SI-NEXT:    v_readlane_b32 s34, v62, 61
+; SI-NEXT:    v_readlane_b32 s25, v62, 62
+; SI-NEXT:    v_readlane_b32 s74, v62, 61
 ; SI-NEXT:    s_lshr_b32 s45, s75, 16
-; SI-NEXT:    v_readlane_b32 s6, v62, 62
 ; SI-NEXT:    s_lshr_b32 s44, s76, 16
-; SI-NEXT:    s_mov_b32 s11, s37
-; SI-NEXT:    v_readlane_b32 s37, v62, 57
-; SI-NEXT:    s_mov_b32 s76, s97
-; SI-NEXT:    s_mov_b32 s97, s4
-; SI-NEXT:    v_readlane_b32 s85, v62, 63
+; SI-NEXT:    s_mov_b32 s76, s37
+; SI-NEXT:    s_mov_b32 s37, s97
+; SI-NEXT:    s_mov_b32 s97, s21
+; SI-NEXT:    s_mov_b32 s21, s18
+; SI-NEXT:    s_mov_b32 s18, s17
+; SI-NEXT:    s_mov_b32 s85, s87
+; SI-NEXT:    s_mov_b32 s87, s30
+; SI-NEXT:    s_mov_b32 s17, s24
 ; SI-NEXT:    s_lshr_b32 s43, s77, 16
 ; SI-NEXT:    s_mov_b32 s77, s35
 ; SI-NEXT:    s_mov_b32 s35, s64
 ; SI-NEXT:    s_mov_b32 s64, s55
 ; SI-NEXT:    s_mov_b32 s55, s83
 ; SI-NEXT:    s_mov_b32 s83, s99
-; SI-NEXT:    s_mov_b32 s99, s38
-; SI-NEXT:    s_mov_b32 s38, s20
-; SI-NEXT:    s_mov_b32 s20, s21
-; SI-NEXT:    s_mov_b32 s21, s52
+; SI-NEXT:    s_mov_b32 s99, s89
+; SI-NEXT:    s_mov_b32 s89, s39
+; SI-NEXT:    s_mov_b32 s39, s95
+; SI-NEXT:    s_mov_b32 s95, s6
 ; SI-NEXT:    s_lshr_b32 s41, s78, 16
-; SI-NEXT:    s_mov_b32 s52, s82
-; SI-NEXT:    s_mov_b32 s82, s66
-; SI-NEXT:    s_mov_b32 s66, s96
-; SI-NEXT:    s_mov_b32 s96, s98
-; SI-NEXT:    s_mov_b32 s98, s89
-; SI-NEXT:    s_mov_b32 s89, s18
-; SI-NEXT:    s_mov_b32 s18, s30
-; SI-NEXT:    v_mov_b32_e32 v6, v19
-; SI-NEXT:    v_mov_b32_e32 v14, v21
+; SI-NEXT:    s_mov_b32 s24, s96
+; SI-NEXT:    s_mov_b32 s96, s4
 ; SI-NEXT:    s_cbranch_execnz .LBB97_3
 ; SI-NEXT:  .LBB97_2: ; %cmp.true
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v8, vcc, 3, v36
+; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; SI-NEXT:    v_mov_b32_e32 v6, v5
+; SI-NEXT:    v_mov_b32_e32 v5, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; SI-NEXT:    s_add_i32 s4, s88, 3
 ; SI-NEXT:    s_and_b32 s4, s4, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s79, 8
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v37
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v34
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    s_addk_i32 s4, 0x300
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v1, v2, v1
+; SI-NEXT:    v_or_b32_e32 v1, v25, v1
 ; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    s_add_i32 s4, s82, 3
-; SI-NEXT:    v_readlane_b32 s8, v62, 12
+; SI-NEXT:    v_readlane_b32 s4, v62, 11
+; SI-NEXT:    s_add_i32 s4, s4, 3
+; SI-NEXT:    v_readlane_b32 s5, v62, 10
+; SI-NEXT:    v_readlane_b32 s6, v62, 9
 ; SI-NEXT:    s_and_b32 s4, s4, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s52, 8
-; SI-NEXT:    s_add_i32 s8, s8, 3
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
+; SI-NEXT:    s_add_i32 s8, s6, 3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_readlane_b32 s5, v62, 11
+; SI-NEXT:    v_readlane_b32 s5, v62, 8
 ; SI-NEXT:    s_and_b32 s8, s8, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 24
 ; SI-NEXT:    s_lshl_b32 s8, s8, 16
 ; SI-NEXT:    s_addk_i32 s4, 0x300
 ; SI-NEXT:    s_or_b32 s5, s5, s8
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    s_add_i32 s70, s70, 3
+; SI-NEXT:    v_readlane_b32 s6, v62, 30
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_readlane_b32 s5, v62, 33
-; SI-NEXT:    s_add_i32 s70, s5, 3
 ; SI-NEXT:    s_and_b32 s5, s70, 0xff
-; SI-NEXT:    s_lshl_b32 s8, s16, 8
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v39
+; SI-NEXT:    s_lshl_b32 s8, s6, 8
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v48
 ; SI-NEXT:    s_or_b32 s5, s8, s5
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    s_addk_i32 s5, 0x300
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    s_and_b32 s5, s5, 0xffff
-; SI-NEXT:    v_or_b32_e32 v2, v49, v2
+; SI-NEXT:    v_or_b32_e32 v2, v39, v2
 ; SI-NEXT:    v_or_b32_e32 v2, s5, v2
 ; SI-NEXT:    s_add_i32 s5, s35, 3
-; SI-NEXT:    v_readlane_b32 s8, v62, 14
+; SI-NEXT:    v_readlane_b32 s6, v62, 13
 ; SI-NEXT:    s_and_b32 s5, s5, 0xff
-; SI-NEXT:    s_lshl_b32 s8, s8, 8
+; SI-NEXT:    s_lshl_b32 s8, s6, 8
 ; SI-NEXT:    s_add_i32 s9, s77, 3
 ; SI-NEXT:    s_or_b32 s5, s8, s5
-; SI-NEXT:    v_readlane_b32 s8, v62, 13
+; SI-NEXT:    v_readlane_b32 s6, v62, 12
 ; SI-NEXT:    s_and_b32 s9, s9, 0xff
-; SI-NEXT:    s_lshl_b32 s8, s8, 24
+; SI-NEXT:    s_lshl_b32 s8, s6, 24
 ; SI-NEXT:    s_lshl_b32 s9, s9, 16
 ; SI-NEXT:    s_addk_i32 s5, 0x300
 ; SI-NEXT:    s_or_b32 s8, s8, s9
@@ -209783,9 +209670,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s5, s8, s5
 ; SI-NEXT:    s_add_i32 s79, s92, 3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x3000000, v1
-; SI-NEXT:    s_add_i32 s9, s4, 0x3000000
+; SI-NEXT:    s_add_i32 s16, s4, 0x3000000
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x3000000, v2
-; SI-NEXT:    s_add_i32 s70, s5, 0x3000000
+; SI-NEXT:    s_add_i32 s9, s5, 0x3000000
 ; SI-NEXT:    s_and_b32 s4, s79, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s7, 8
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v33
@@ -209794,12 +209681,16 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_addk_i32 s4, 0x300
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v2, v30, v2
+; SI-NEXT:    v_or_b32_e32 v2, v49, v2
 ; SI-NEXT:    v_or_b32_e32 v2, s4, v2
-; SI-NEXT:    s_add_i32 s4, s66, 3
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v8, v9, v8
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 3, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT:    s_add_i32 s4, s24, 3
 ; SI-NEXT:    s_and_b32 s4, s4, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s55, 8
-; SI-NEXT:    s_add_i32 s8, s76, 3
+; SI-NEXT:    s_add_i32 s8, s37, 3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_and_b32 s8, s8, 0xff
 ; SI-NEXT:    s_addk_i32 s4, 0x300
@@ -209808,8 +209699,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s5, s8
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s52, s87, 3
-; SI-NEXT:    s_add_i32 s39, s4, 0x3000000
+; SI-NEXT:    s_add_i32 s52, s98, 3
+; SI-NEXT:    s_add_i32 s11, s4, 0x3000000
 ; SI-NEXT:    s_and_b32 s4, s52, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s67, 8
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v38
@@ -209818,68 +209709,64 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_addk_i32 s4, 0x300
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v4, v48, v4
-; SI-NEXT:    s_add_i32 s30, s18, 3
+; SI-NEXT:    v_or_b32_e32 v4, v20, v4
+; SI-NEXT:    s_add_i32 s30, s87, 3
 ; SI-NEXT:    v_or_b32_e32 v4, s4, v4
 ; SI-NEXT:    s_and_b32 s4, s30, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s98, 8
-; SI-NEXT:    s_add_i32 s8, s6, 3
+; SI-NEXT:    s_lshl_b32 s5, s21, 8
+; SI-NEXT:    s_add_i32 s8, s68, 3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_and_b32 s8, s8, 0xff
 ; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s27, 24
+; SI-NEXT:    s_lshl_b32 s5, s28, 24
 ; SI-NEXT:    s_lshl_b32 s8, s8, 16
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s5, s8
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s48, s4, 0x3000000
 ; SI-NEXT:    v_readlane_b32 s4, v62, 42
+; SI-NEXT:    v_mov_b32_e32 v22, v30
 ; SI-NEXT:    s_add_i32 s87, s4, 3
 ; SI-NEXT:    v_readlane_b32 s5, v62, 39
 ; SI-NEXT:    s_and_b32 s4, s87, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v32
+; SI-NEXT:    v_add_i32_e32 v7, vcc, 3, v22
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
 ; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v5, v24, v5
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    v_readlane_b32 s4, v62, 30
+; SI-NEXT:    v_or_b32_e32 v7, v35, v7
+; SI-NEXT:    v_or_b32_e32 v7, s4, v7
+; SI-NEXT:    v_readlane_b32 s4, v62, 32
 ; SI-NEXT:    s_add_i32 s67, s4, 3
 ; SI-NEXT:    s_and_b32 s4, s67, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s86, 8
+; SI-NEXT:    s_lshl_b32 s5, s85, 8
 ; SI-NEXT:    s_add_i32 s8, s69, 3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_and_b32 s8, s8, 0xff
 ; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s34, 24
+; SI-NEXT:    s_lshl_b32 s5, s74, 24
 ; SI-NEXT:    s_lshl_b32 s8, s8, 16
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s5, s8
 ; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_add_i32 s50, s90, 3
+; SI-NEXT:    v_readlane_b32 s5, v62, 23
 ; SI-NEXT:    s_add_i32 s49, s4, 0x3000000
-; SI-NEXT:    v_readlane_b32 s4, v62, 24
-; SI-NEXT:    v_mov_b32_e32 v25, v28
-; SI-NEXT:    s_add_i32 s50, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 22
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x3000000, v5
 ; SI-NEXT:    s_and_b32 s4, s50, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v25
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
 ; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_or_b32_e32 v5, v8, v5
-; SI-NEXT:    s_add_i32 s94, s89, 3
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
+; SI-NEXT:    s_add_i32 s94, s86, 3
+; SI-NEXT:    v_or_b32_e32 v8, s4, v8
 ; SI-NEXT:    s_and_b32 s4, s94, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s85, 8
+; SI-NEXT:    s_lshl_b32 s5, s38, 8
 ; SI-NEXT:    s_add_i32 s8, s71, 3
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v10, vcc, 3, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_and_b32 s8, s8, 0xff
 ; SI-NEXT:    s_addk_i32 s4, 0x300
@@ -209889,27 +209776,25 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s5, s5, s8
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s94, s4, 0x3000000
-; SI-NEXT:    v_readlane_b32 s4, v62, 51
-; SI-NEXT:    v_mov_b32_e32 v21, v27
+; SI-NEXT:    v_readlane_b32 s4, v62, 52
 ; SI-NEXT:    s_add_i32 s18, s4, 3
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x3000000, v5
+; SI-NEXT:    v_readlane_b32 s5, v62, 51
 ; SI-NEXT:    s_and_b32 s4, s18, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s37, 8
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v21
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
 ; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v5, v9, v5
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
+; SI-NEXT:    v_or_b32_e32 v9, v24, v9
+; SI-NEXT:    v_or_b32_e32 v9, s4, v9
 ; SI-NEXT:    v_readlane_b32 s4, v62, 45
 ; SI-NEXT:    s_add_i32 s98, s4, 3
 ; SI-NEXT:    v_readlane_b32 s5, v62, 43
+; SI-NEXT:    v_readlane_b32 s6, v62, 14
 ; SI-NEXT:    s_and_b32 s4, s98, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    s_add_i32 s8, s91, 3
+; SI-NEXT:    s_add_i32 s8, s6, 3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_and_b32 s8, s8, 0xff
 ; SI-NEXT:    s_addk_i32 s4, 0x300
@@ -209920,33 +209805,27 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s53, s4, 0x3000000
 ; SI-NEXT:    v_readlane_b32 s4, v62, 41
-; SI-NEXT:    v_mov_b32_e32 v19, v26
 ; SI-NEXT:    s_add_i32 s86, s4, 3
 ; SI-NEXT:    v_readlane_b32 s5, v62, 38
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x3000000, v5
 ; SI-NEXT:    s_and_b32 s4, s86, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v19
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v10
 ; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v5, v23, v5
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 0x3000000, v5
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; SI-NEXT:    v_readlane_b32 s4, v62, 29
+; SI-NEXT:    v_or_b32_e32 v10, v59, v10
+; SI-NEXT:    v_or_b32_e32 v10, s4, v10
+; SI-NEXT:    v_readlane_b32 s4, v62, 31
 ; SI-NEXT:    s_add_i32 s66, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 26
+; SI-NEXT:    v_readlane_b32 s5, v62, 27
 ; SI-NEXT:    s_and_b32 s4, s66, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    s_add_i32 s37, s38, 3
+; SI-NEXT:    s_add_i32 s37, s39, 3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_readlane_b32 s5, v62, 21
 ; SI-NEXT:    s_and_b32 s8, s37, 0xff
 ; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s5, 24
+; SI-NEXT:    s_lshl_b32 s5, s95, 24
 ; SI-NEXT:    s_lshl_b32 s8, s8, 16
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s5, s8
@@ -209961,65 +209840,33 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_addk_i32 s4, 0x300
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    v_readlane_b32 s5, v62, 37
-; SI-NEXT:    v_readlane_b32 s6, v62, 34
 ; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    s_add_i32 s71, s6, 3
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT:    s_add_i32 s71, s22, 3
 ; SI-NEXT:    s_and_b32 s8, s71, 0xff
 ; SI-NEXT:    s_lshl_b32 s8, s8, 16
 ; SI-NEXT:    s_add_i32 s35, s99, 3
-; SI-NEXT:    s_and_b32 s6, s35, 0xff
-; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 3, v13
-; SI-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_mov_b32_e32 v31, s9
-; SI-NEXT:    v_mov_b32_e32 v39, s70
-; SI-NEXT:    v_add_i32_e32 v36, vcc, 0x3000000, v2
-; SI-NEXT:    v_mov_b32_e32 v28, s39
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x3000000, v4
-; SI-NEXT:    v_mov_b32_e32 v27, s48
-; SI-NEXT:    v_mov_b32_e32 v26, s49
-; SI-NEXT:    v_mov_b32_e32 v25, s94
-; SI-NEXT:    v_mov_b32_e32 v24, s53
-; SI-NEXT:    v_mov_b32_e32 v23, s36
-; SI-NEXT:    v_alignbit_b32 v42, v23, v10, 16
-; SI-NEXT:    v_alignbit_b32 v41, v24, v9, 16
-; SI-NEXT:    v_alignbit_b32 v40, v25, v8, 16
-; SI-NEXT:    v_alignbit_b32 v55, v26, v7, 16
-; SI-NEXT:    v_alignbit_b32 v54, v27, v4, 16
-; SI-NEXT:    v_alignbit_b32 v53, v28, v36, 16
-; SI-NEXT:    v_alignbit_b32 v52, v39, v1, 16
-; SI-NEXT:    v_alignbit_b32 v51, v31, v3, 16
-; SI-NEXT:    s_lshr_b32 s57, s36, 16
-; SI-NEXT:    s_lshr_b32 s56, s53, 16
-; SI-NEXT:    s_lshr_b32 s47, s94, 16
-; SI-NEXT:    s_lshr_b32 s46, s49, 16
-; SI-NEXT:    s_lshr_b32 s45, s48, 16
-; SI-NEXT:    s_lshr_b32 s44, s39, 16
-; SI-NEXT:    s_lshr_b32 s43, s70, 16
-; SI-NEXT:    s_lshr_b32 s41, s9, 16
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v5
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v5, v22, v5
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 0x3000000, v5
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_add_i32_e32 v11, vcc, 3, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT:    v_or_b32_e32 v11, v32, v11
+; SI-NEXT:    v_or_b32_e32 v11, s4, v11
 ; SI-NEXT:    v_readlane_b32 s4, v62, 40
 ; SI-NEXT:    s_add_i32 s85, s4, 3
 ; SI-NEXT:    s_and_b32 s4, s85, 0xff
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_readlane_b32 s5, v62, 31
+; SI-NEXT:    v_readlane_b32 s5, v62, 33
 ; SI-NEXT:    s_addk_i32 s4, 0x300
 ; SI-NEXT:    s_lshl_b32 s5, s5, 24
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s5, s8
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s8, s4, 0x3000000
-; SI-NEXT:    v_readlane_b32 s4, v62, 53
+; SI-NEXT:    v_readlane_b32 s4, v62, 54
 ; SI-NEXT:    s_add_i32 s17, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 52
+; SI-NEXT:    v_readlane_b32 s5, v62, 53
 ; SI-NEXT:    s_and_b32 s4, s17, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_or_b32 s4, s5, s4
@@ -210027,25 +209874,58 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    v_readlane_b32 s5, v62, 47
 ; SI-NEXT:    s_lshl_b32 s5, s5, 8
+; SI-NEXT:    s_and_b32 s6, s35, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s6, 16
+; SI-NEXT:    v_mov_b32_e32 v30, s16
+; SI-NEXT:    v_mov_b32_e32 v39, s9
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x3000000, v2
+; SI-NEXT:    v_mov_b32_e32 v28, s11
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x3000000, v4
+; SI-NEXT:    v_mov_b32_e32 v27, s48
+; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x3000000, v7
+; SI-NEXT:    v_mov_b32_e32 v26, s49
+; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x3000000, v8
+; SI-NEXT:    v_mov_b32_e32 v25, s94
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x3000000, v9
+; SI-NEXT:    v_mov_b32_e32 v24, s53
+; SI-NEXT:    v_add_i32_e32 v10, vcc, 0x3000000, v10
+; SI-NEXT:    v_mov_b32_e32 v23, s36
+; SI-NEXT:    v_add_i32_e32 v11, vcc, 0x3000000, v11
 ; SI-NEXT:    v_mov_b32_e32 v22, s8
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v43, v22, v11, 16
+; SI-NEXT:    v_alignbit_b32 v42, v23, v10, 16
+; SI-NEXT:    v_alignbit_b32 v41, v24, v9, 16
+; SI-NEXT:    v_alignbit_b32 v40, v25, v8, 16
+; SI-NEXT:    v_alignbit_b32 v55, v26, v7, 16
+; SI-NEXT:    v_alignbit_b32 v54, v27, v4, 16
+; SI-NEXT:    v_alignbit_b32 v53, v28, v2, 16
+; SI-NEXT:    v_alignbit_b32 v52, v39, v1, 16
+; SI-NEXT:    v_alignbit_b32 v51, v30, v3, 16
 ; SI-NEXT:    s_lshr_b32 s58, s8, 16
+; SI-NEXT:    s_lshr_b32 s57, s36, 16
+; SI-NEXT:    s_lshr_b32 s56, s53, 16
+; SI-NEXT:    s_lshr_b32 s47, s94, 16
+; SI-NEXT:    s_lshr_b32 s46, s49, 16
+; SI-NEXT:    s_lshr_b32 s45, s48, 16
+; SI-NEXT:    s_lshr_b32 s44, s11, 16
+; SI-NEXT:    s_lshr_b32 s43, s9, 16
+; SI-NEXT:    s_lshr_b32 s41, s16, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v5
 ; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v5, v20, v5
+; SI-NEXT:    v_or_b32_e32 v5, v12, v5
 ; SI-NEXT:    v_or_b32_e32 v5, s4, v5
 ; SI-NEXT:    v_add_i32_e32 v12, vcc, 0x3000000, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v13
+; SI-NEXT:    v_add_i32_e32 v13, vcc, 3, v6
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
 ; SI-NEXT:    v_readlane_b32 s4, v62, 48
 ; SI-NEXT:    s_add_i32 s7, s4, 3
 ; SI-NEXT:    s_and_b32 s4, s7, 0xff
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s22, 24
+; SI-NEXT:    s_lshl_b32 s5, s89, 24
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s5, s6
 ; SI-NEXT:    s_or_b32 s4, s5, s4
@@ -210060,35 +209940,32 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_addk_i32 s4, 0x300
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_readlane_b32 s5, v62, 35
-; SI-NEXT:    v_readlane_b32 s6, v62, 27
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    s_add_i32 s55, s6, 3
-; SI-NEXT:    s_and_b32 s6, s55, 0xff
-; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    v_mov_b32_e32 v20, s10
-; SI-NEXT:    v_alignbit_b32 v44, v20, v12, 16
-; SI-NEXT:    s_lshr_b32 s59, s10, 16
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v5, v14, v5
+; SI-NEXT:    v_or_b32_e32 v5, v19, v5
 ; SI-NEXT:    v_or_b32_e32 v5, s4, v5
 ; SI-NEXT:    v_readlane_b32 s4, v62, 36
 ; SI-NEXT:    s_add_i32 s81, s4, 3
+; SI-NEXT:    v_readlane_b32 s5, v62, 35
+; SI-NEXT:    v_readlane_b32 s6, v62, 28
 ; SI-NEXT:    s_and_b32 s4, s81, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
+; SI-NEXT:    s_add_i32 s55, s6, 3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    v_readlane_b32 s5, v62, 26
+; SI-NEXT:    s_and_b32 s6, s55, 0xff
 ; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s20, 24
+; SI-NEXT:    s_lshl_b32 s5, s5, 24
+; SI-NEXT:    s_lshl_b32 s6, s6, 16
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s5, s6
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s12, s4, 0x3000000
-; SI-NEXT:    v_readlane_b32 s4, v62, 32
+; SI-NEXT:    v_readlane_b32 s4, v62, 34
 ; SI-NEXT:    s_add_i32 s69, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 28
+; SI-NEXT:    v_readlane_b32 s5, v62, 29
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 0x3000000, v5
 ; SI-NEXT:    s_and_b32 s4, s69, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v6
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v29
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
 ; SI-NEXT:    s_addk_i32 s4, 0x300
@@ -210096,10 +209973,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v5, v18, v5
 ; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    v_readlane_b32 s4, v62, 20
+; SI-NEXT:    v_readlane_b32 s4, v62, 22
 ; SI-NEXT:    s_add_i32 s34, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 19
-; SI-NEXT:    v_readlane_b32 s6, v62, 17
+; SI-NEXT:    v_readlane_b32 s5, v62, 21
+; SI-NEXT:    v_readlane_b32 s6, v62, 19
 ; SI-NEXT:    s_and_b32 s4, s34, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_add_i32 s92, s6, 3
@@ -210114,21 +209991,21 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_add_i32 s13, s4, 0x3000000
 ; SI-NEXT:    v_readlane_b32 s4, v62, 25
 ; SI-NEXT:    s_add_i32 s51, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 23
+; SI-NEXT:    v_readlane_b32 s5, v62, 24
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 0x3000000, v5
 ; SI-NEXT:    s_and_b32 s4, s51, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v29
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v21
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
 ; SI-NEXT:    s_addk_i32 s4, 0x300
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v5, v61, v5
+; SI-NEXT:    v_or_b32_e32 v5, v14, v5
 ; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    v_readlane_b32 s4, v62, 18
+; SI-NEXT:    v_readlane_b32 s4, v62, 20
 ; SI-NEXT:    s_add_i32 s95, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 16
+; SI-NEXT:    v_readlane_b32 s5, v62, 18
 ; SI-NEXT:    s_and_b32 s4, s95, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_add_i32 s6, s96, 3
@@ -210145,35 +210022,37 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_and_b32 s4, s4, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s84, 8
 ; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    v_and_b32_e32 v13, 0xff, v13
 ; SI-NEXT:    s_addk_i32 s4, 0x300
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v6, v60, v13
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v6, v6, v13
 ; SI-NEXT:    v_or_b32_e32 v6, s4, v6
 ; SI-NEXT:    s_add_i32 s4, s83, 3
 ; SI-NEXT:    s_and_b32 s4, s4, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s68, 8
+; SI-NEXT:    s_lshl_b32 s5, s25, 8
 ; SI-NEXT:    s_add_i32 s6, s64, 3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    v_readlane_b32 s5, v62, 15
 ; SI-NEXT:    s_and_b32 s6, s6, 0xff
 ; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s65, 24
+; SI-NEXT:    s_lshl_b32 s5, s5, 24
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s5, s6
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s15, s4, 0x3000000
-; SI-NEXT:    v_readlane_b32 s4, v62, 4
+; SI-NEXT:    v_readlane_b32 s4, v62, 2
 ; SI-NEXT:    s_add_i32 s4, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 3
-; SI-NEXT:    v_readlane_b32 s6, v62, 2
+; SI-NEXT:    v_readlane_b32 s5, v62, 1
 ; SI-NEXT:    s_and_b32 s4, s4, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    s_add_i32 s6, s6, 3
+; SI-NEXT:    s_add_i32 s6, s26, 3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_readlane_b32 s5, v62, 1
 ; SI-NEXT:    s_and_b32 s6, s6, 0xff
 ; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s5, 24
+; SI-NEXT:    s_lshl_b32 s5, s27, 24
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s5, s6
@@ -210183,20 +210062,21 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_add_i32 s4, s4, 3
 ; SI-NEXT:    s_and_b32 s4, s4, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s29, 8
-; SI-NEXT:    s_add_i32 s6, s11, 3
+; SI-NEXT:    s_add_i32 s6, s76, 3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    v_readlane_b32 s5, v62, 16
 ; SI-NEXT:    s_and_b32 s6, s6, 0xff
 ; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s90, 24
+; SI-NEXT:    s_lshl_b32 s5, s5, 24
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s5, s6
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s40, s4, 0x3000000
-; SI-NEXT:    v_readlane_b32 s4, v62, 10
+; SI-NEXT:    v_readlane_b32 s4, v62, 7
 ; SI-NEXT:    s_add_i32 s4, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 15
-; SI-NEXT:    v_readlane_b32 s6, v62, 9
+; SI-NEXT:    v_readlane_b32 s5, v62, 17
+; SI-NEXT:    v_readlane_b32 s6, v62, 6
 ; SI-NEXT:    s_and_b32 s4, s4, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_add_i32 s6, s6, 3
@@ -210208,16 +210088,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s5, s6
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s21, s4, 0x3000000
-; SI-NEXT:    v_readlane_b32 s4, v62, 8
-; SI-NEXT:    s_add_i32 s4, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 7
-; SI-NEXT:    v_readlane_b32 s6, v62, 6
+; SI-NEXT:    s_add_i32 s18, s4, 0x3000000
+; SI-NEXT:    s_add_i32 s4, s20, 3
+; SI-NEXT:    v_readlane_b32 s5, v62, 5
+; SI-NEXT:    v_readlane_b32 s6, v62, 4
 ; SI-NEXT:    s_and_b32 s4, s4, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_add_i32 s6, s6, 3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_readlane_b32 s5, v62, 5
+; SI-NEXT:    v_readlane_b32 s5, v62, 3
 ; SI-NEXT:    s_and_b32 s6, s6, 0xff
 ; SI-NEXT:    s_addk_i32 s4, 0x300
 ; SI-NEXT:    s_lshl_b32 s5, s5, 24
@@ -210226,7 +210105,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s5, s5, s6
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s42, s4, 0x3000000
-; SI-NEXT:    v_mov_b32_e32 v13, s21
+; SI-NEXT:    v_mov_b32_e32 v13, s18
+; SI-NEXT:    v_mov_b32_e32 v20, s10
 ; SI-NEXT:    v_mov_b32_e32 v19, s12
 ; SI-NEXT:    v_mov_b32_e32 v18, s13
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, 0x3000000, v5
@@ -210238,16 +210118,20 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    v_alignbit_b32 v58, s40, v13, 16
 ; SI-NEXT:    v_alignbit_b32 v56, v6, v50, 16
 ; SI-NEXT:    v_alignbit_b32 v47, v5, v17, 16
+; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    v_alignbit_b32 v46, v18, v16, 16
 ; SI-NEXT:    v_alignbit_b32 v45, v19, v15, 16
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v44, v20, v12, 16
 ; SI-NEXT:    s_lshr_b32 s73, s42, 16
 ; SI-NEXT:    s_lshr_b32 s72, s40, 16
 ; SI-NEXT:    s_lshr_b32 s63, s15, 16
 ; SI-NEXT:    s_lshr_b32 s62, s14, 16
 ; SI-NEXT:    s_lshr_b32 s61, s13, 16
 ; SI-NEXT:    s_lshr_b32 s60, s12, 16
+; SI-NEXT:    s_lshr_b32 s59, s10, 16
 ; SI-NEXT:  .LBB97_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s21, 0xffff
+; SI-NEXT:    s_and_b32 s4, s18, 0xffff
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v57
 ; SI-NEXT:    v_or_b32_e32 v5, s4, v5
 ; SI-NEXT:    s_and_b32 s4, s42, 0xffff
@@ -210256,7 +210140,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, 4, v0
-; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_mov_b32_e32 v6, s4
 ; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
 ; SI-NEXT:    s_and_b32 s4, s17, 0xffff
@@ -210414,9 +210297,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x64, v0
 ; SI-NEXT:    v_mov_b32_e32 v5, s4
 ; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v36
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v53
-; SI-NEXT:    s_and_b32 s4, s39, 0xffff
+; SI-NEXT:    s_and_b32 s4, s11, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s44, 16
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v4
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x68, v0
@@ -210428,7 +210311,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v52
-; SI-NEXT:    s_and_b32 s4, s70, 0xffff
+; SI-NEXT:    s_and_b32 s4, s9, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s43, 16
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
@@ -210441,7 +210324,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v3
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v51
-; SI-NEXT:    s_and_b32 s4, s9, 0xffff
+; SI-NEXT:    s_and_b32 s4, s16, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s41, 16
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
@@ -210502,26 +210385,27 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    v_readlane_b32 s31, v63, 1
 ; SI-NEXT:    v_readlane_b32 s30, v63, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB97_4:
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v2, v58
-; SI-NEXT:    v_mov_b32_e32 v49, v57
-; SI-NEXT:    v_mov_b32_e32 v30, v56
-; SI-NEXT:    v_mov_b32_e32 v48, v46
-; SI-NEXT:    v_mov_b32_e32 v24, v45
-; SI-NEXT:    v_mov_b32_e32 v28, v36
-; SI-NEXT:    v_mov_b32_e32 v27, v35
-; SI-NEXT:    v_mov_b32_e32 v26, v34
-; SI-NEXT:    v_mov_b32_e32 v13, v21
-; SI-NEXT:    ; implicit-def: $sgpr21
+; SI-NEXT:    v_mov_b32_e32 v5, v13
+; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v25, v58
+; SI-NEXT:    v_mov_b32_e32 v48, v39
+; SI-NEXT:    v_mov_b32_e32 v39, v57
+; SI-NEXT:    v_mov_b32_e32 v49, v56
+; SI-NEXT:    v_mov_b32_e32 v20, v47
+; SI-NEXT:    v_mov_b32_e32 v30, v37
+; SI-NEXT:    v_mov_b32_e32 v36, v35
+; SI-NEXT:    v_mov_b32_e32 v35, v45
+; SI-NEXT:    v_mov_b32_e32 v27, v26
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_mov_b32_e32 v32, v23
+; SI-NEXT:    ; implicit-def: $sgpr18
 ; SI-NEXT:    ; implicit-def: $vgpr57
 ; SI-NEXT:    ; implicit-def: $sgpr42
 ; SI-NEXT:    ; implicit-def: $sgpr73
@@ -210573,17 +210457,17 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI-NEXT:    ; implicit-def: $vgpr54
 ; SI-NEXT:    ; implicit-def: $sgpr48
 ; SI-NEXT:    ; implicit-def: $sgpr45
-; SI-NEXT:    ; implicit-def: $vgpr36
+; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr53
-; SI-NEXT:    ; implicit-def: $sgpr39
+; SI-NEXT:    ; implicit-def: $sgpr11
 ; SI-NEXT:    ; implicit-def: $sgpr44
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr52
-; SI-NEXT:    ; implicit-def: $sgpr70
+; SI-NEXT:    ; implicit-def: $sgpr9
 ; SI-NEXT:    ; implicit-def: $sgpr43
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr51
-; SI-NEXT:    ; implicit-def: $sgpr9
+; SI-NEXT:    ; implicit-def: $sgpr16
 ; SI-NEXT:    ; implicit-def: $sgpr41
 ; SI-NEXT:    s_branch .LBB97_2
 ;
@@ -210647,14 +210531,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:168
 ; VI-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:176
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v3
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v5
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v7
 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v40, 8, v27
 ; VI-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
 ; VI-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
 ; VI-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
@@ -210666,42 +210549,46 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_lshlrev_b32_e32 v25, 8, v25
 ; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v29
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v27
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v2
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v29
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v4
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v6
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v4
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v6
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v10
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v12
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v10
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v24
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v12
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v26
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
 ; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v28
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v30
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v31
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v32
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v33
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v34
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v35
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v36
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v37
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(14)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v38
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:184
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:192
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:200
@@ -210710,37 +210597,34 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    buffer_load_ushort v5, off, s[0:3], s32 offset:224
 ; VI-NEXT:    buffer_load_ushort v6, off, s[0:3], s32 offset:232
 ; VI-NEXT:    buffer_load_ushort v7, off, s[0:3], s32 offset:240
-; VI-NEXT:    v_lshlrev_b32_e32 v8, 8, v26
-; VI-NEXT:    v_lshlrev_b32_e32 v10, 8, v28
+; VI-NEXT:    v_lshlrev_b32_e32 v45, 8, v22
+; VI-NEXT:    v_lshlrev_b32_e32 v8, 8, v24
 ; VI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; VI-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
 ; VI-NEXT:    v_lshlrev_b32_e32 v16, 8, v16
 ; VI-NEXT:    v_lshlrev_b32_e32 v18, 8, v18
 ; VI-NEXT:    v_lshlrev_b32_e32 v20, 8, v20
-; VI-NEXT:    v_lshlrev_b32_e32 v22, 8, v22
 ; VI-NEXT:    s_waitcnt vmcnt(7)
-; VI-NEXT:    v_lshlrev_b32_e32 v24, 8, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v22, 8, v0
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(6)
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v2
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v3
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v4
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v5
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v6
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(6)
+; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v7
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v24, 8, v2
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:248
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:256
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:264
@@ -210759,11 +210643,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v4
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v7
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:312
 ; VI-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:320
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:328
@@ -210772,8 +210651,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:36
+; VI-NEXT:    s_waitcnt vmcnt(11)
+; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
 ; VI-NEXT:    v_lshlrev_b32_e32 v28, 8, v3
 ; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
+; VI-NEXT:    s_waitcnt vmcnt(10)
+; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v7
 ; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, 8, v0
 ; VI-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:44
@@ -210782,45 +210665,47 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:68
 ; VI-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:76
 ; VI-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:84
-; VI-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:92
-; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:100
-; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:108
+; VI-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:92
+; VI-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:100
+; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:108
 ; VI-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:116
 ; VI-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:124
-; VI-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:132
-; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:140
-; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:148
-; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:156
-; VI-NEXT:    buffer_load_ushort v60, off, s[0:3], s32 offset:164
-; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:172
+; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:132
+; VI-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:140
+; VI-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:148
+; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:156
+; VI-NEXT:    buffer_load_ushort v46, off, s[0:3], s32 offset:164
+; VI-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:172
 ; VI-NEXT:    buffer_load_ushort v47, off, s[0:3], s32 offset:180
 ; VI-NEXT:    buffer_load_ushort v63, off, s[0:3], s32 offset:188
-; VI-NEXT:    buffer_load_ushort v46, off, s[0:3], s32 offset:196
+; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:196
 ; VI-NEXT:    buffer_load_ushort v61, off, s[0:3], s32 offset:204
-; VI-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:212
+; VI-NEXT:    buffer_load_ushort v60, off, s[0:3], s32 offset:212
 ; VI-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:220
-; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:228
-; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:236
-; VI-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:244
-; VI-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:252
+; VI-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:228
+; VI-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:236
+; VI-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:244
+; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:252
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:260
 ; VI-NEXT:    s_waitcnt vmcnt(14)
-; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v1
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:268
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:276
-; VI-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:284
-; VI-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:292
-; VI-NEXT:    buffer_load_ushort v7, off, s[0:3], s32 offset:300
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:308
+; VI-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:268
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:276
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:316
-; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:324
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:284
+; VI-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:292
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:300
+; VI-NEXT:    buffer_load_ushort v7, off, s[0:3], s32 offset:308
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_ushort v7, off, s[0:3], s32 offset:316
+; VI-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:324
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
@@ -210830,50 +210715,46 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(14)
+; VI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; VI-NEXT:    s_cbranch_scc0 .LBB97_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
@@ -210890,10 +210771,11 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; VI-NEXT:    s_waitcnt vmcnt(4)
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
 ; VI-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -210917,7 +210799,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v17, v10
 ; VI-NEXT:    v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
@@ -210934,43 +210815,38 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v40, v42
-; VI-NEXT:    v_mov_b32_e32 v42, v44
-; VI-NEXT:    v_mov_b32_e32 v44, v45
-; VI-NEXT:    v_mov_b32_e32 v45, v62
-; VI-NEXT:    v_or_b32_sdwa v2, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v3, v53, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; VI-NEXT:    v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v34, v24
 ; VI-NEXT:    v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v36, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v37, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -210978,74 +210854,77 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_or_b32_sdwa v0, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v39, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; VI-NEXT:    v_or_b32_sdwa v0, v48, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v48, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v45, v62
 ; VI-NEXT:    v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v32, v1
 ; VI-NEXT:    v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v54, v22
+; VI-NEXT:    v_mov_b32_e32 v41, v24
 ; VI-NEXT:    v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_mov_b32_e32 v33, v0
+; VI-NEXT:    v_mov_b32_e32 v34, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v37, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v50, v26
+; VI-NEXT:    v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v55, v26
 ; VI-NEXT:    v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v39, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v49, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v43, v27
 ; VI-NEXT:    v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_mov_b32_e32 v51, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v53, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v52, v28
+; VI-NEXT:    v_mov_b32_e32 v35, v1
+; VI-NEXT:    v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v53, v28
 ; VI-NEXT:    v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT:    v_or_b32_sdwa v1, v47, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v1, v47, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v55, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v33, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_mov_b32_e32 v35, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v1, v57, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v41, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v63, v27
-; VI-NEXT:    v_mov_b32_e32 v46, v57
+; VI-NEXT:    v_mov_b32_e32 v36, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_mov_b32_e32 v36, v0
+; VI-NEXT:    v_mov_b32_e32 v56, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v56, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v61, v59
+; VI-NEXT:    v_mov_b32_e32 v58, v1
+; VI-NEXT:    v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v61, v60
+; VI-NEXT:    v_mov_b32_e32 v60, v59
 ; VI-NEXT:    v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
@@ -211057,53 +210936,55 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; VI-NEXT:    v_or_b32_sdwa v1, v44, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v1, v45, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_mov_b32_e32 v58, v0
+; VI-NEXT:    v_mov_b32_e32 v50, v0
 ; VI-NEXT:    v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
 ; VI-NEXT:    v_or_b32_sdwa v1, v62, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v43, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v52, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; VI-NEXT:    v_or_b32_sdwa v0, v59, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v60, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v46, v1
+; VI-NEXT:    v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_mov_b32_e32 v54, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v63, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v47, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_mov_b32_e32 v57, v1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v3, s4, v0
@@ -211135,10 +211016,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    s_cbranch_execnz .LBB97_3
 ; VI-NEXT:  .LBB97_2: ; %cmp.true
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v59
+; VI-NEXT:    v_or_b32_sdwa v29, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
 ; VI-NEXT:    s_add_i32 s28, s28, 3
 ; VI-NEXT:    s_and_b32 s4, s28, 0xff
 ; VI-NEXT:    s_lshl_b32 s5, s29, 8
@@ -211157,147 +211040,165 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    s_lshl_b32 s9, s19, 8
 ; VI-NEXT:    s_add_i32 s16, s16, 3
 ; VI-NEXT:    s_lshl_b32 s10, s17, 8
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v28, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v59
-; VI-NEXT:    v_or_b32_sdwa v25, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v26, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v62
-; VI-NEXT:    v_or_b32_sdwa v27, v63, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v45
-; VI-NEXT:    v_or_b32_sdwa v52, v43, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v28, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v44
-; VI-NEXT:    v_or_b32_sdwa v26, v50, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v53, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v45
+; VI-NEXT:    v_or_b32_sdwa v27, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v42
-; VI-NEXT:    v_or_b32_sdwa v63, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v52, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v40
-; VI-NEXT:    v_or_b32_sdwa v43, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v61
+; VI-NEXT:    v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v60
 ; VI-NEXT:    v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v46
-; VI-NEXT:    v_or_b32_sdwa v24, v56, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v61
+; VI-NEXT:    v_or_b32_sdwa v24, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v48, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v48, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v48, vcc, 0x300, v48
 ; VI-NEXT:    v_or_b32_sdwa v24, v24, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v24, vcc, 0x3000000, v24
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    v_or_b32_sdwa v23, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v38, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v38, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v38, vcc, 0x300, v38
 ; VI-NEXT:    v_or_b32_sdwa v23, v23, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v23, vcc, 0x3000000, v23
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v22, v34, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v22, v54, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v36, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
-; VI-NEXT:    v_add_u32_e32 v50, vcc, 0x300, v36
+; VI-NEXT:    v_or_b32_sdwa v50, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT:    v_add_u32_e32 v50, vcc, 0x300, v50
 ; VI-NEXT:    v_or_b32_sdwa v22, v22, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v22, vcc, 0x3000000, v22
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v21, v53, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v21, v35, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v53, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v54, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    v_or_b32_sdwa v20, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    v_or_b32_sdwa v49, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v49, vcc, 0x300, v49
 ; VI-NEXT:    v_or_b32_sdwa v20, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v20, vcc, 0x3000000, v20
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    v_or_b32_sdwa v19, v37, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v37, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v37, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v37, vcc, 0x300, v37
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT:    v_or_b32_sdwa v31, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
 ; VI-NEXT:    v_or_b32_sdwa v19, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v19, vcc, 0x3000000, v19
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    v_or_b32_sdwa v18, v32, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    s_waitcnt vmcnt(4)
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT:    v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_or_b32_sdwa v58, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
-; VI-NEXT:    v_add_u32_e32 v36, vcc, 0x300, v58
-; VI-NEXT:    v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_or_b32_sdwa v57, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; VI-NEXT:    v_add_u32_e32 v35, vcc, 0x300, v57
+; VI-NEXT:    v_or_b32_sdwa v18, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v18, vcc, 0x3000000, v18
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v34, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v34, vcc, 0x300, v34
 ; VI-NEXT:    v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v14, vcc, 0x3000000, v14
@@ -211306,78 +211207,67 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v13, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
-; VI-NEXT:    v_or_b32_sdwa v31, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_or_b32_sdwa v35, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v36, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
-; VI-NEXT:    v_add_u32_e32 v35, vcc, 0x300, v35
-; VI-NEXT:    v_or_b32_sdwa v13, v13, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    v_add_u32_e32 v35, vcc, 0x300, v25
-; VI-NEXT:    v_add_u32_e32 v25, vcc, 0x300, v59
-; VI-NEXT:    v_or_b32_sdwa v25, v43, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    v_or_b32_sdwa v28, v28, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT:    v_add_u32_e32 v36, vcc, 0x300, v36
+; VI-NEXT:    v_or_b32_sdwa v13, v13, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v36, vcc, 0x300, v26
+; VI-NEXT:    v_add_u32_e32 v26, vcc, 0x300, v52
+; VI-NEXT:    v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v52, vcc, 0x300, v54
+; VI-NEXT:    v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v13, vcc, 0x3000000, v13
-; VI-NEXT:    v_add_u32_e32 v25, vcc, 0x3000000, v25
-; VI-NEXT:    v_add_u32_e32 v28, vcc, 0x3000000, v28
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT:    v_add_u32_e32 v21, vcc, 0x3000000, v21
+; VI-NEXT:    v_add_u32_e32 v26, vcc, 0x3000000, v26
 ; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v12, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
-; VI-NEXT:    v_or_b32_sdwa v30, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v51, vcc, 0x300, v51
 ; VI-NEXT:    v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    v_add_u32_e32 v33, vcc, 0x300, v1
-; VI-NEXT:    v_add_u32_e32 v51, vcc, 0x300, v63
-; VI-NEXT:    v_or_b32_sdwa v26, v26, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    v_or_b32_sdwa v30, v30, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v51, vcc, 0x300, v59
+; VI-NEXT:    v_or_b32_sdwa v25, v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v12, vcc, 0x3000000, v12
-; VI-NEXT:    v_add_u32_e32 v26, vcc, 0x3000000, v26
-; VI-NEXT:    v_add_u32_e32 v30, vcc, 0x3000000, v30
+; VI-NEXT:    v_add_u32_e32 v25, vcc, 0x3000000, v25
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v57, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v33, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v40, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT:    v_or_b32_sdwa v30, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_or_b32_sdwa v39, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT:    v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_add_u32_e32 v34, vcc, 0x300, v2
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -211401,14 +211291,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_add_u32_e32 v41, vcc, 0x300, v10
 ; VI-NEXT:    v_add_u32_e32 v10, vcc, 0x300, v55
 ; VI-NEXT:    v_or_b32_sdwa v10, v39, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    v_add_u32_e32 v39, vcc, 0x300, v52
-; VI-NEXT:    v_add_u32_e32 v52, vcc, 0x300, v53
-; VI-NEXT:    v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    v_or_b32_sdwa v27, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v39, vcc, 0x300, v53
+; VI-NEXT:    v_or_b32_sdwa v27, v28, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_or_b32_sdwa v28, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_or_b32_sdwa v29, v30, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v9, vcc, 0x3000000, v9
 ; VI-NEXT:    v_add_u32_e32 v10, vcc, 0x3000000, v10
-; VI-NEXT:    v_add_u32_e32 v21, vcc, 0x3000000, v21
 ; VI-NEXT:    v_add_u32_e32 v27, vcc, 0x3000000, v27
+; VI-NEXT:    v_add_u32_e32 v28, vcc, 0x3000000, v28
+; VI-NEXT:    v_add_u32_e32 v29, vcc, 0x3000000, v29
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -211424,14 +211315,18 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_add_u32_e32 v42, vcc, 0x300, v42
 ; VI-NEXT:    v_or_b32_sdwa v8, v8, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v42, vcc, 0x300, v11
-; VI-NEXT:    v_or_b32_sdwa v17, v17, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v11, vcc, 0x300, v40
-; VI-NEXT:    v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_or_b32_sdwa v11, v33, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v33, vcc, 0x300, v1
+; VI-NEXT:    v_or_b32_sdwa v30, v31, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v17, v17, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, 0x3000000, v8
 ; VI-NEXT:    v_add_u32_e32 v11, vcc, 0x3000000, v11
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_add_u32_e32 v30, vcc, 0x3000000, v30
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_or_b32_sdwa v7, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
@@ -211471,29 +211366,19 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
-; VI-NEXT:    v_or_b32_sdwa v29, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v46, vcc, 0x300, v46
 ; VI-NEXT:    v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v5, vcc, 0x3000000, v5
-; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
-; VI-NEXT:    v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u32_e32 v34, vcc, 0x300, v2
-; VI-NEXT:    v_or_b32_sdwa v29, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT:    v_add_u32_e32 v29, vcc, 0x3000000, v29
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 3, v4
 ; VI-NEXT:    v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v47, vcc, 3, v32
-; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 0x300, v4
 ; VI-NEXT:    v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 0x3000000, v4
@@ -211560,38 +211445,35 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ; VI-NEXT:  .LBB97_4:
-; VI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; VI-NEXT:    v_mov_b32_e32 v61, v59
-; VI-NEXT:    v_mov_b32_e32 v46, v57
-; VI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; VI-NEXT:    v_mov_b32_e32 v40, v42
-; VI-NEXT:    v_mov_b32_e32 v42, v44
-; VI-NEXT:    v_mov_b32_e32 v44, v45
+; VI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v61, v60
+; VI-NEXT:    v_mov_b32_e32 v60, v59
 ; VI-NEXT:    v_mov_b32_e32 v45, v62
-; VI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v57, v5
 ; VI-NEXT:    v_mov_b32_e32 v47, v4
-; VI-NEXT:    v_mov_b32_e32 v54, v3
-; VI-NEXT:    v_mov_b32_e32 v52, v28
-; VI-NEXT:    v_mov_b32_e32 v63, v27
-; VI-NEXT:    v_mov_b32_e32 v50, v26
-; VI-NEXT:    v_mov_b32_e32 v34, v24
+; VI-NEXT:    v_mov_b32_e32 v63, v3
+; VI-NEXT:    v_mov_b32_e32 v53, v28
+; VI-NEXT:    v_mov_b32_e32 v43, v27
+; VI-NEXT:    v_mov_b32_e32 v55, v26
+; VI-NEXT:    v_mov_b32_e32 v41, v24
+; VI-NEXT:    v_mov_b32_e32 v54, v22
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; VI-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; VI-NEXT:    s_branch .LBB97_2
@@ -211653,18 +211535,18 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v29
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 8, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 8, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v46, 8, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 8, v7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 8, v9
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 8, v11
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 8, v13
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 8, v15
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 8, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 8, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 8, v25
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 8, v27
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 8, v19
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 8, v21
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 8, v23
-; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 8, v25
-; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
 ; GFX9-NEXT:    s_waitcnt vmcnt(24)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v43
 ; GFX9-NEXT:    s_waitcnt vmcnt(23)
@@ -211693,10 +211575,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(23)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v52
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(23)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v51
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(23)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v50
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
@@ -211708,7 +211590,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(23)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v39
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(23)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v30
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
@@ -211756,7 +211638,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v3
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v13
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
@@ -211783,23 +211665,23 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v15
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v3
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v13
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v5
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v9
-; GFX9-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_load_ushort v3, off, s[0:3], s32 offset:312
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:320
@@ -211812,49 +211694,48 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    s_waitcnt vmcnt(15)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 8, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 8, v1
 ; GFX9-NEXT:    buffer_load_ushort v52, off, s[0:3], s32 offset:44
 ; GFX9-NEXT:    buffer_load_ushort v50, off, s[0:3], s32 offset:52
 ; GFX9-NEXT:    buffer_load_ushort v49, off, s[0:3], s32 offset:60
 ; GFX9-NEXT:    buffer_load_ushort v15, off, s[0:3], s32 offset:68
 ; GFX9-NEXT:    buffer_load_ushort v48, off, s[0:3], s32 offset:76
 ; GFX9-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:84
-; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:92
+; GFX9-NEXT:    buffer_load_ushort v25, off, s[0:3], s32 offset:92
 ; GFX9-NEXT:    buffer_load_ushort v34, off, s[0:3], s32 offset:100
 ; GFX9-NEXT:    buffer_load_ushort v33, off, s[0:3], s32 offset:108
-; GFX9-NEXT:    buffer_load_ushort v46, off, s[0:3], s32 offset:116
+; GFX9-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:116
 ; GFX9-NEXT:    buffer_load_ushort v43, off, s[0:3], s32 offset:124
 ; GFX9-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:132
-; GFX9-NEXT:    buffer_load_ushort v61, off, s[0:3], s32 offset:140
+; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:140
 ; GFX9-NEXT:    buffer_load_ushort v47, off, s[0:3], s32 offset:148
-; GFX9-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:156
+; GFX9-NEXT:    buffer_load_ushort v61, off, s[0:3], s32 offset:156
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:164
-; GFX9-NEXT:    s_waitcnt vmcnt(22)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(21)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:172
-; GFX9-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:180
-; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:188
-; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:196
-; GFX9-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:204
+; GFX9-NEXT:    buffer_load_ushort v31, off, s[0:3], s32 offset:172
+; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:180
+; GFX9-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:188
+; GFX9-NEXT:    buffer_load_ushort v30, off, s[0:3], s32 offset:196
+; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:204
 ; GFX9-NEXT:    buffer_load_ushort v35, off, s[0:3], s32 offset:212
-; GFX9-NEXT:    buffer_load_ushort v28, off, s[0:3], s32 offset:220
-; GFX9-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:228
+; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:220
+; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:228
 ; GFX9-NEXT:    buffer_load_ushort v41, off, s[0:3], s32 offset:236
 ; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:244
-; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:252
-; GFX9-NEXT:    buffer_load_ushort v45, off, s[0:3], s32 offset:260
+; GFX9-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:252
+; GFX9-NEXT:    buffer_load_ushort v27, off, s[0:3], s32 offset:260
 ; GFX9-NEXT:    buffer_load_ushort v60, off, s[0:3], s32 offset:268
-; GFX9-NEXT:    buffer_load_ushort v29, off, s[0:3], s32 offset:276
+; GFX9-NEXT:    buffer_load_ushort v57, off, s[0:3], s32 offset:276
 ; GFX9-NEXT:    buffer_load_ushort v59, off, s[0:3], s32 offset:284
 ; GFX9-NEXT:    buffer_load_ushort v63, off, s[0:3], s32 offset:292
-; GFX9-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:300
-; GFX9-NEXT:    buffer_load_ushort v32, off, s[0:3], s32 offset:308
-; GFX9-NEXT:    buffer_load_ushort v56, off, s[0:3], s32 offset:316
+; GFX9-NEXT:    buffer_load_ushort v62, off, s[0:3], s32 offset:300
+; GFX9-NEXT:    buffer_load_ushort v40, off, s[0:3], s32 offset:308
+; GFX9-NEXT:    buffer_load_ushort v58, off, s[0:3], s32 offset:316
 ; GFX9-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:324
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
@@ -211865,54 +211746,55 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(28)
-; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(30)
-; GFX9-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(33)
 ; GFX9-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_waitcnt vmcnt(36)
-; GFX9-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(38)
+; GFX9-NEXT:    s_waitcnt vmcnt(39)
 ; GFX9-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(40)
-; GFX9-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(40)
-; GFX9-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(41)
+; GFX9-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(41)
+; GFX9-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(40)
+; GFX9-NEXT:    s_waitcnt vmcnt(41)
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(40)
-; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_waitcnt vmcnt(40)
+; GFX9-NEXT:    s_waitcnt vmcnt(41)
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_waitcnt vmcnt(41)
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
@@ -211922,7 +211804,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB97_2
@@ -211935,7 +211817,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v4, v4, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v4, v4, 16, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -211972,10 +211854,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v7, v1, 16, v0
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
@@ -211991,13 +211873,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v9, v1, 16, v0
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -212005,7 +211887,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
@@ -212046,8 +211928,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    v_mov_b32_e32 v52, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v14, v1, 16, v0
-; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v50, v0
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -212065,16 +211947,16 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v48, v1
-; GFX9-NEXT:    v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v17, v17, 16, v1
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    v_mov_b32_e32 v33, v46
+; GFX9-NEXT:    v_mov_b32_e32 v33, v45
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v18, v1, 16, v0
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
@@ -212087,7 +211969,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -212096,7 +211978,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -212104,122 +211986,121 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v22, v1, 16, v0
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v34, v1
-; GFX9-NEXT:    v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v23, v1, 16, v0
-; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_mov_b32_e32 v40, v0
-; GFX9-NEXT:    v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v46, v0
+; GFX9-NEXT:    v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v35, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v35, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v24, v1, 16, v0
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v51, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_mov_b32_e32 v35, v45
+; GFX9-NEXT:    v_mov_b32_e32 v45, v61
+; GFX9-NEXT:    v_mov_b32_e32 v61, v42
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v38, v0
-; GFX9-NEXT:    v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_or_b32_sdwa v1, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_lshl_or_b32 v25, v1, 16, v0
+; GFX9-NEXT:    v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v54, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v41, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    v_mov_b32_e32 v41, v57
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_lshl_or_b32 v26, v1, 16, v0
-; GFX9-NEXT:    v_or_b32_sdwa v0, v44, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_or_b32_sdwa v1, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_or_b32_sdwa v0, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v45, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_lshl_or_b32 v27, v1, 16, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v60, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v60, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_or_b32_sdwa v1, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v57, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v28, v1, 16, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v59, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v63, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_mov_b32_e32 v57, v59
+; GFX9-NEXT:    v_or_b32_sdwa v1, v63, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v29, v1, 16, v0
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshl_or_b32 v30, v1, 16, v0
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v56, v42
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v31, v1, 16, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:    s_branch .LBB97_3
 ; GFX9-NEXT:  .LBB97_2:
 ; GFX9-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_mov_b32_e32 v33, v46
-; GFX9-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_mov_b32_e32 v33, v45
+; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_mov_b32_e32 v56, v61
-; GFX9-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX9-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; GFX9-NEXT:  .LBB97_3: ; %Flow
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
@@ -212422,7 +212303,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_add_u32_e32 v0, 3, v0
-; GFX9-NEXT:    v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v1, 3, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -212482,11 +212363,11 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v26, 3, v26
-; GFX9-NEXT:    v_or_b32_sdwa v48, v40, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v48, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v26, 3, v26
-; GFX9-NEXT:    v_or_b32_sdwa v49, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v49, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v26, 3, v26
@@ -212521,7 +212402,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    v_or_b32_sdwa v53, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_add_u32_e32 v24, 3, v24
-; GFX9-NEXT:    v_add_u32_e32 v26, 3, v62
+; GFX9-NEXT:    v_add_u32_e32 v26, 3, v61
 ; GFX9-NEXT:    v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v36, 0x300, v24
 ; GFX9-NEXT:    v_add_u32_e32 v24, 0x300, v48
@@ -212530,7 +212411,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v54, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_add_u32_e32 v26, 3, v61
+; GFX9-NEXT:    v_add_u32_e32 v26, 3, v45
 ; GFX9-NEXT:    v_add_u32_e32 v20, 3, v20
 ; GFX9-NEXT:    v_or_b32_sdwa v20, v57, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v34, 0x300, v20
@@ -212539,7 +212420,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_add_u32_e32 v26, 3, v56
 ; GFX9-NEXT:    v_add_u32_e32 v21, 3, v21
-; GFX9-NEXT:    v_or_b32_sdwa v21, v45, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v21, v32, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u32_e32 v28, 0x300, v21
 ; GFX9-NEXT:    v_add_u32_e32 v21, 0x300, v54
 ; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff, v21
@@ -216418,1031 +216299,1192 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
-; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
-; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v16
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v15
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v11
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v9
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v7
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v5
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v3
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v1
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v25
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v23
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v22
-; VI-NEXT:    v_lshrrev_b32_e32 v59, 16, v14
-; VI-NEXT:    v_lshrrev_b32_e32 v57, 16, v12
-; VI-NEXT:    v_lshrrev_b32_e32 v62, 16, v10
-; VI-NEXT:    v_lshrrev_b32_e32 v52, 16, v8
-; VI-NEXT:    v_lshrrev_b32_e32 v54, 16, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v4
-; VI-NEXT:    v_lshrrev_b32_e32 v56, 16, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v36, 16, v30
-; VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v29
-; VI-NEXT:    v_lshrrev_b32_e32 v50, 16, v28
-; VI-NEXT:    v_lshrrev_b32_e32 v47, 16, v27
-; VI-NEXT:    v_lshrrev_b32_e32 v53, 16, v26
-; VI-NEXT:    v_lshrrev_b32_e32 v40, 16, v24
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v21
-; VI-NEXT:    v_lshrrev_b32_e32 v63, 16, v18
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(14)
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v20
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v19
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v32
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_dword v36, off, s[0:3], s32
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v15
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v13
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v11
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v9
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v7
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v5
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v3
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v1
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v29
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v28
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v27
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v26
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v25
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v24
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v23
+; VI-NEXT:    v_lshrrev_b32_e32 v60, 16, v16
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v14
+; VI-NEXT:    v_lshrrev_b32_e32 v63, 16, v12
+; VI-NEXT:    v_lshrrev_b32_e32 v59, 16, v10
+; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v8
+; VI-NEXT:    v_lshrrev_b32_e32 v47, 16, v6
+; VI-NEXT:    v_lshrrev_b32_e32 v56, 16, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v57, 16, v2
+; VI-NEXT:    v_lshrrev_b32_e32 v46, 16, v30
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v22
+; VI-NEXT:    v_lshrrev_b32_e32 v49, 16, v21
+; VI-NEXT:    v_lshrrev_b32_e32 v53, 16, v20
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v19
+; VI-NEXT:    v_lshrrev_b32_e32 v52, 16, v18
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; VI-NEXT:    s_waitcnt vmcnt(14)
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v31
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v17
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr60
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr58
-; VI-NEXT:    ; implicit-def: $vgpr37
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 16, v37
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v50, 16, v36
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 16, v17
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr32
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
 ; VI-NEXT:    ; implicit-def: $vgpr55
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; kill: killed $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr62
+; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr38
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr42
+; VI-NEXT:    ; implicit-def: $vgpr61
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr58
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; kill: killed $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
 ; VI-NEXT:    ; implicit-def: $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; kill: killed $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr46
+; VI-NEXT:    ; kill: killed $vgpr48
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    ; kill: killed $vgpr48
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    ; kill: killed $vgpr48
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    ; kill: killed $vgpr48
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; kill: killed $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr43
+; VI-NEXT:    ; implicit-def: $vgpr40
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; VI-NEXT:    ; implicit-def: $vgpr45
 ; VI-NEXT:    ; implicit-def: $vgpr44
 ; VI-NEXT:    ; implicit-def: $vgpr41
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr42
-; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr61
-; VI-NEXT:    ; implicit-def: $vgpr49
-; VI-NEXT:    ; implicit-def: $vgpr38
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB98_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v16
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v16
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v15
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v14
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v13
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v12
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v12
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v11
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v10
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v10
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v9
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v8
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v8
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v7
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v6
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v6
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v5
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v4
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v4
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v3
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v2
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v1
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v32
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v32
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v31
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v30
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v30
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v29
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v28
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v28
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v27
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v26
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v26
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v25
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v24
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v24
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v23
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v22
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v22
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v21
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v20
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v19
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[15:16]
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[13:14]
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[11:12]
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b64 v[44:45], 24, v[5:6]
-; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[9:10]
-; VI-NEXT:    v_lshrrev_b64 v[45:46], 24, v[3:4]
-; VI-NEXT:    v_mov_b32_e32 v34, v47
-; VI-NEXT:    v_lshrrev_b64 v[46:47], 24, v[1:2]
-; VI-NEXT:    v_mov_b32_e32 v47, v34
-; VI-NEXT:    v_lshrrev_b64 v[34:35], 24, v[31:32]
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v34, v36
-; VI-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
-; VI-NEXT:    v_lshrrev_b64 v[38:39], 24, v[25:26]
-; VI-NEXT:    v_mov_b32_e32 v36, v34
-; VI-NEXT:    v_mov_b32_e32 v34, v50
-; VI-NEXT:    v_lshrrev_b64 v[49:50], 24, v[23:24]
-; VI-NEXT:    v_mov_b32_e32 v39, v62
-; VI-NEXT:    v_lshrrev_b64 v[61:62], 24, v[21:22]
-; VI-NEXT:    v_lshrrev_b64 v[41:42], 24, v[7:8]
-; VI-NEXT:    v_mov_b32_e32 v50, v34
-; VI-NEXT:    v_mov_b32_e32 v62, v39
-; VI-NEXT:    v_mov_b32_e32 v34, v40
-; VI-NEXT:    v_lshrrev_b64 v[39:40], 24, v[19:20]
-; VI-NEXT:    v_mov_b32_e32 v40, v34
-; VI-NEXT:    v_mov_b32_e32 v34, v43
-; VI-NEXT:    v_lshrrev_b64 v[42:43], 24, v[17:18]
-; VI-NEXT:    v_lshrrev_b32_e32 v60, 24, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v48, 24, v20
-; VI-NEXT:    v_lshrrev_b32_e32 v55, 24, v18
-; VI-NEXT:    v_lshrrev_b32_e32 v37, 8, v18
-; VI-NEXT:    v_lshrrev_b32_e32 v58, 8, v17
-; VI-NEXT:    v_mov_b32_e32 v43, v34
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v16
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v16
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v15
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v14
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v13
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v12
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v12
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v11
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v10
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v10
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v9
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v8
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v8
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v7
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v6
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v6
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b64 v[31:32], 24, v[15:16]
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b64 v[31:32], 24, v[13:14]
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b64 v[31:32], 24, v[11:12]
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b64 v[31:32], 24, v[9:10]
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v31, v7
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v9, v10
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v9, v11
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v9, v12
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v9, v13
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v9, v14
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v9, v16
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v9, v8
+; VI-NEXT:    v_lshrrev_b64 v[7:8], 24, v[7:8]
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v7, v5
+; VI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v7, v6
+; VI-NEXT:    v_lshrrev_b64 v[5:6], 24, v[5:6]
+; VI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
+; VI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; VI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
+; VI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v5, v3
+; VI-NEXT:    v_lshrrev_b64 v[40:41], 24, v[3:4]
+; VI-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v3, v1
+; VI-NEXT:    v_lshrrev_b64 v[43:44], 24, v[1:2]
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 24, v37
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v37
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v36
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v1, v36
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v1, v37
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v3, v2
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b64 v[1:2], 24, v[36:37]
+; VI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v5, v4
+; VI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 24, v30
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v30
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v29
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v1, v29
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v1, v30
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b64 v[1:2], 24, v[29:30]
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 24, v28
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v28
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v27
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v1, v27
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v1, v28
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b64 v[1:2], 24, v[27:28]
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v26
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v25
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v1, v25
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v1, v26
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b64 v[1:2], 24, v[25:26]
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v24
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v23
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v1, v23
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v1, v24
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v22
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v21
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v1, v21
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v1, v22
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v1, v19
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v1, v20
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v1, v17
+; VI-NEXT:    v_lshrrev_b64 v[44:45], 24, v[19:20]
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v1, v18
+; VI-NEXT:    v_lshrrev_b64 v[54:55], 24, v[23:24]
+; VI-NEXT:    v_lshrrev_b64 v[41:42], 24, v[21:22]
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v1, v46
+; VI-NEXT:    v_lshrrev_b64 v[45:46], 24, v[17:18]
+; VI-NEXT:    v_mov_b32_e32 v32, v15
+; VI-NEXT:    v_lshrrev_b32_e32 v35, 24, v26
+; VI-NEXT:    v_lshrrev_b32_e32 v39, 24, v24
+; VI-NEXT:    v_lshrrev_b32_e32 v58, 24, v22
+; VI-NEXT:    v_lshrrev_b32_e32 v61, 24, v20
+; VI-NEXT:    v_lshrrev_b32_e32 v42, 8, v20
+; VI-NEXT:    v_lshrrev_b32_e32 v38, 8, v19
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v18
+; VI-NEXT:    v_lshrrev_b32_e32 v62, 8, v18
+; VI-NEXT:    v_lshrrev_b32_e32 v55, 8, v17
+; VI-NEXT:    v_mov_b32_e32 v46, v1
+; VI-NEXT:    ; implicit-def: $vgpr1
+; VI-NEXT:    ; implicit-def: $vgpr3
+; VI-NEXT:    ; implicit-def: $vgpr5
+; VI-NEXT:    ; implicit-def: $vgpr7
+; VI-NEXT:    ; implicit-def: $vgpr9
+; VI-NEXT:    ; implicit-def: $vgpr11
+; VI-NEXT:    ; implicit-def: $vgpr13
+; VI-NEXT:    ; implicit-def: $vgpr15
+; VI-NEXT:    ; implicit-def: $vgpr17
+; VI-NEXT:    ; implicit-def: $vgpr19
+; VI-NEXT:    ; implicit-def: $vgpr21
+; VI-NEXT:    ; implicit-def: $vgpr23
+; VI-NEXT:    ; implicit-def: $vgpr25
+; VI-NEXT:    ; implicit-def: $vgpr27
+; VI-NEXT:    ; implicit-def: $vgpr29
+; VI-NEXT:    ; implicit-def: $vgpr37
 ; VI-NEXT:  .LBB98_2: ; %Flow
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB98_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
-; VI-NEXT:    v_mov_b32_e32 v33, 3
-; VI-NEXT:    v_add_u16_e32 v34, 3, v18
-; VI-NEXT:    v_add_u16_sdwa v18, v18, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; VI-NEXT:    v_or_b32_e32 v35, v34, v18
-; VI-NEXT:    v_add_u16_e32 v18, 3, v17
-; VI-NEXT:    v_add_u16_sdwa v17, v17, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT:    v_or_b32_e32 v34, v18, v17
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_u16_sdwa v17, v20, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v34, 3, v20
-; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; VI-NEXT:    v_or_b32_e32 v35, v34, v17
-; VI-NEXT:    v_add_u16_sdwa v17, v19, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v20, 3, v19
-; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT:    v_or_b32_e32 v34, v20, v17
-; VI-NEXT:    v_add_u16_sdwa v17, v22, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_u16_e32 v19, 3, v22
-; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; VI-NEXT:    v_or_b32_e32 v61, v19, v17
-; VI-NEXT:    v_add_u16_sdwa v17, v21, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v22, 3, v21
-; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; VI-NEXT:    v_or_b32_e32 v60, v22, v17
-; VI-NEXT:    v_add_u16_sdwa v17, v24, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_u16_e32 v19, 3, v24
-; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; VI-NEXT:    v_or_b32_e32 v63, v19, v17
-; VI-NEXT:    v_add_u16_sdwa v17, v23, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v24, 3, v23
-; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; VI-NEXT:    v_or_b32_e32 v62, v24, v17
-; VI-NEXT:    v_add_u16_sdwa v17, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_u16_e32 v19, 3, v26
-; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; VI-NEXT:    v_or_b32_e32 v58, v19, v17
-; VI-NEXT:    v_add_u16_sdwa v17, v25, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v26, 3, v25
-; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; VI-NEXT:    v_or_b32_e32 v57, v26, v17
-; VI-NEXT:    v_add_u16_sdwa v17, v28, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_u16_e32 v19, 3, v28
-; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; VI-NEXT:    v_or_b32_e32 v56, v19, v17
-; VI-NEXT:    v_add_u16_sdwa v17, v27, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v28, 3, v27
-; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; VI-NEXT:    v_or_b32_e32 v55, v28, v17
-; VI-NEXT:    v_add_u16_sdwa v17, v30, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_u16_e32 v21, 3, v30
-; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_u16_sdwa v19, v29, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; VI-NEXT:    v_add_u16_e32 v30, 3, v29
-; VI-NEXT:    v_or_b32_e32 v40, v21, v17
-; VI-NEXT:    v_lshlrev_b32_e32 v17, 16, v19
-; VI-NEXT:    v_or_b32_e32 v39, v30, v17
-; VI-NEXT:    v_add_u16_sdwa v17, v32, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_u16_e32 v45, 3, v32
-; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_u16_sdwa v19, v31, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_mov_b32_e32 v31, 3
+; VI-NEXT:    v_add_u16_sdwa v51, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_e32 v32, 3, v18
+; VI-NEXT:    v_lshlrev_b32_e32 v18, 16, v51
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v18, v32, v18
+; VI-NEXT:    v_add_u16_e32 v32, 3, v17
+; VI-NEXT:    v_add_u16_sdwa v17, v17, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; VI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_u16_e32 v32, 3, v31
-; VI-NEXT:    v_or_b32_e32 v38, v45, v17
-; VI-NEXT:    v_lshlrev_b32_e32 v17, 16, v19
-; VI-NEXT:    v_add_u16_sdwa v21, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT:    v_or_b32_e32 v37, v32, v17
-; VI-NEXT:    v_add_u16_e32 v17, 3, v2
-; VI-NEXT:    v_add_u16_e32 v2, 3, v1
-; VI-NEXT:    v_add_u16_sdwa v19, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v21
-; VI-NEXT:    v_or_b32_e32 v49, v17, v1
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v19
-; VI-NEXT:    v_or_b32_e32 v48, v2, v1
-; VI-NEXT:    v_add_u16_sdwa v1, v4, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_u16_e32 v19, 3, v4
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_u16_e32 v4, 3, v3
-; VI-NEXT:    v_add_u16_sdwa v3, v3, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT:    v_or_b32_e32 v47, v19, v1
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; VI-NEXT:    v_or_b32_e32 v46, v4, v1
-; VI-NEXT:    v_add_u16_sdwa v1, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_u16_e32 v21, 3, v6
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_u16_sdwa v3, v5, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT:    v_add_u16_e32 v6, 3, v5
-; VI-NEXT:    v_or_b32_e32 v44, v21, v1
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; VI-NEXT:    v_or_b32_e32 v43, v6, v1
-; VI-NEXT:    v_add_u16_sdwa v1, v8, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_u16_e32 v23, 3, v8
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_u16_sdwa v3, v7, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT:    v_add_u16_e32 v8, 3, v7
-; VI-NEXT:    v_or_b32_e32 v42, v23, v1
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; VI-NEXT:    v_or_b32_e32 v41, v8, v1
-; VI-NEXT:    v_add_u16_sdwa v1, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_u16_e32 v25, 3, v10
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_u16_sdwa v3, v9, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT:    v_add_u16_e32 v10, 3, v9
-; VI-NEXT:    v_or_b32_e32 v54, v25, v1
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; VI-NEXT:    v_add_u16_sdwa v52, v12, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT:    v_or_b32_e32 v53, v10, v1
-; VI-NEXT:    v_add_u16_e32 v27, 3, v12
-; VI-NEXT:    v_add_u16_sdwa v3, v11, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v52
-; VI-NEXT:    v_add_u16_e32 v12, 3, v11
-; VI-NEXT:    v_or_b32_e32 v51, v27, v1
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; VI-NEXT:    v_or_b32_e32 v50, v12, v1
-; VI-NEXT:    v_add_u16_sdwa v1, v14, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_u16_e32 v29, 3, v14
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; VI-NEXT:    v_add_u16_sdwa v3, v13, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT:    v_add_u16_e32 v14, 3, v13
-; VI-NEXT:    v_or_b32_e32 v36, v29, v1
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; VI-NEXT:    v_add_u16_sdwa v59, v16, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT:    v_or_b32_e32 v35, v14, v1
-; VI-NEXT:    v_add_u16_sdwa v3, v15, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v16, 3, v16
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v59
-; VI-NEXT:    v_add_u16_e32 v15, 3, v15
-; VI-NEXT:    v_or_b32_e32 v34, v16, v1
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; VI-NEXT:    v_or_b32_e32 v33, v15, v1
-; VI-NEXT:    v_mov_b32_e32 v31, v32
-; VI-NEXT:    v_mov_b32_e32 v32, v45
-; VI-NEXT:    v_lshrrev_b32_e32 v45, 8, v34
-; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v1, v2
-; VI-NEXT:    v_mov_b32_e32 v2, v17
-; VI-NEXT:    v_mov_b32_e32 v3, v4
-; VI-NEXT:    v_mov_b32_e32 v4, v19
-; VI-NEXT:    v_mov_b32_e32 v5, v6
-; VI-NEXT:    v_mov_b32_e32 v6, v21
-; VI-NEXT:    v_mov_b32_e32 v7, v8
-; VI-NEXT:    v_mov_b32_e32 v8, v23
-; VI-NEXT:    v_mov_b32_e32 v9, v10
-; VI-NEXT:    v_mov_b32_e32 v10, v25
-; VI-NEXT:    v_mov_b32_e32 v11, v12
-; VI-NEXT:    v_mov_b32_e32 v12, v27
-; VI-NEXT:    v_mov_b32_e32 v13, v14
-; VI-NEXT:    v_mov_b32_e32 v14, v29
-; VI-NEXT:    v_mov_b32_e32 v17, v18
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; VI-NEXT:    v_mov_b32_e32 v19, v20
-; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; VI-NEXT:    v_mov_b32_e32 v21, v22
-; VI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; VI-NEXT:    v_mov_b32_e32 v23, v24
-; VI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; VI-NEXT:    v_mov_b32_e32 v25, v26
-; VI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; VI-NEXT:    v_mov_b32_e32 v27, v28
-; VI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; VI-NEXT:    v_mov_b32_e32 v29, v30
-; VI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v45, 8, v33
-; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[33:34]
-; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v36
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v35
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[35:36]
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v51
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v50
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[50:51]
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v54
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v53
-; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[53:54]
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v42
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v41
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v44
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v43
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v47
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v46
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v49
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v48
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v17, v32, v17
+; VI-NEXT:    v_add_u16_e32 v32, 3, v20
+; VI-NEXT:    v_add_u16_sdwa v20, v20, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v20, v32, v20
+; VI-NEXT:    v_add_u16_e32 v32, 3, v19
+; VI-NEXT:    v_add_u16_sdwa v19, v19, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; VI-NEXT:    v_add_u16_sdwa v48, v22, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v19, v32, v19
+; VI-NEXT:    v_add_u16_e32 v32, 3, v22
+; VI-NEXT:    v_lshlrev_b32_e32 v22, 16, v48
+; VI-NEXT:    v_add_u16_sdwa v53, v21, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v22, v32, v22
+; VI-NEXT:    v_add_u16_e32 v32, 3, v21
+; VI-NEXT:    v_lshlrev_b32_e32 v21, 16, v53
+; VI-NEXT:    v_add_u16_sdwa v61, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v21, v32, v21
+; VI-NEXT:    v_add_u16_e32 v32, 3, v24
+; VI-NEXT:    v_lshlrev_b32_e32 v24, 16, v61
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v24, v32, v24
+; VI-NEXT:    v_add_u16_e32 v32, 3, v23
+; VI-NEXT:    v_add_u16_sdwa v23, v23, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; VI-NEXT:    v_add_u16_sdwa v58, v26, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v23, v32, v23
+; VI-NEXT:    v_add_u16_e32 v32, 3, v26
+; VI-NEXT:    v_lshlrev_b32_e32 v26, 16, v58
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v26, v32, v26
+; VI-NEXT:    v_add_u16_e32 v32, 3, v25
+; VI-NEXT:    v_add_u16_sdwa v25, v25, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; VI-NEXT:    v_add_u16_sdwa v39, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v25, v32, v25
+; VI-NEXT:    v_add_u16_e32 v32, 3, v28
+; VI-NEXT:    v_lshlrev_b32_e32 v28, 16, v39
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v28, v32, v28
+; VI-NEXT:    v_add_u16_e32 v32, 3, v27
+; VI-NEXT:    v_add_u16_sdwa v27, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; VI-NEXT:    v_add_u16_sdwa v35, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v27, v32, v27
+; VI-NEXT:    v_add_u16_e32 v33, 3, v30
+; VI-NEXT:    v_add_u16_e32 v34, 3, v29
+; VI-NEXT:    v_add_u16_sdwa v32, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b32_e32 v29, 16, v35
+; VI-NEXT:    v_add_u16_sdwa v52, v37, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v30, v33, v29
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v29, 16, v32
+; VI-NEXT:    v_add_u16_e32 v33, 3, v37
+; VI-NEXT:    v_add_u16_sdwa v50, v36, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v52
 ; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v38
-; VI-NEXT:    v_lshrrev_b64 v[44:45], 24, v[43:44]
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v37
-; VI-NEXT:    v_lshrrev_b64 v[45:46], 24, v[46:47]
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b64 v[34:35], 24, v[37:38]
-; VI-NEXT:    v_lshrrev_b64 v[46:47], 24, v[48:49]
-; VI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v40
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v39
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b64 v[34:35], 24, v[39:40]
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v56
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v55
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v58
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v57
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v63
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v62
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v61
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v60
-; VI-NEXT:    v_lshrrev_b64 v[35:36], 24, v[55:56]
-; VI-NEXT:    v_lshrrev_b64 v[49:50], 24, v[62:63]
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b64 v[61:62], 24, v[60:61]
-; VI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshrrev_b64 v[38:39], 24, v[57:58]
-; VI-NEXT:    v_mov_b32_e32 v57, v52
-; VI-NEXT:    v_lshrrev_b64 v[41:42], 24, v[41:42]
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v55
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v54
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b64 v[39:40], 24, v[54:55]
-; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT:    v_bfe_u32 v34, v59, 8, 8
-; VI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; VI-NEXT:    s_waitcnt vmcnt(4)
-; VI-NEXT:    v_lshrrev_b32_e32 v58, 8, v54
+; VI-NEXT:    v_or_b32_e32 v29, v34, v29
+; VI-NEXT:    v_add_u16_e32 v34, 3, v36
+; VI-NEXT:    v_or_b32_e32 v37, v33, v32
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v50
+; VI-NEXT:    v_add_u16_sdwa v57, v2, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v36, v34, v32
+; VI-NEXT:    v_add_u16_e32 v33, 3, v2
+; VI-NEXT:    v_add_u16_e32 v34, 3, v1
+; VI-NEXT:    v_add_u16_sdwa v32, v1, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v57
+; VI-NEXT:    v_or_b32_e32 v2, v33, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v32
+; VI-NEXT:    v_add_u16_sdwa v56, v4, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v1, v34, v1
+; VI-NEXT:    v_add_u16_e32 v33, 3, v4
+; VI-NEXT:    v_add_u16_e32 v34, 3, v3
+; VI-NEXT:    v_add_u16_sdwa v32, v3, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v56
+; VI-NEXT:    v_or_b32_e32 v4, v33, v3
+; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v32
+; VI-NEXT:    v_add_u16_sdwa v47, v6, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v3, v34, v3
+; VI-NEXT:    v_add_u16_e32 v33, 3, v6
+; VI-NEXT:    v_add_u16_e32 v34, 3, v5
+; VI-NEXT:    v_add_u16_sdwa v32, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v47
+; VI-NEXT:    v_or_b32_e32 v6, v33, v5
+; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v32
+; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v5, v34, v5
+; VI-NEXT:    v_add_u16_sdwa v34, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT:    v_add_u16_e32 v38, 3, v8
+; VI-NEXT:    v_add_u16_e32 v33, 3, v7
+; VI-NEXT:    v_add_u16_sdwa v32, v7, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v34
+; VI-NEXT:    v_or_b32_e32 v8, v38, v7
+; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v32
+; VI-NEXT:    v_add_u16_sdwa v59, v10, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v7, v33, v7
+; VI-NEXT:    v_add_u16_e32 v33, 3, v10
+; VI-NEXT:    v_add_u16_e32 v38, 3, v9
+; VI-NEXT:    v_add_u16_sdwa v32, v9, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b32_e32 v9, 16, v59
+; VI-NEXT:    v_or_b32_e32 v10, v33, v9
+; VI-NEXT:    v_lshlrev_b32_e32 v9, 16, v32
+; VI-NEXT:    v_add_u16_sdwa v63, v12, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v9, v38, v9
+; VI-NEXT:    v_add_u16_e32 v33, 3, v12
+; VI-NEXT:    v_add_u16_e32 v38, 3, v11
+; VI-NEXT:    v_add_u16_sdwa v32, v11, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b32_e32 v11, 16, v63
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v12, v33, v11
+; VI-NEXT:    v_lshlrev_b32_e32 v11, 16, v32
+; VI-NEXT:    v_add_u16_sdwa v33, v14, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v11, v38, v11
+; VI-NEXT:    v_add_u16_e32 v38, 3, v14
+; VI-NEXT:    v_add_u16_e32 v49, 3, v13
+; VI-NEXT:    v_add_u16_sdwa v32, v13, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v33
+; VI-NEXT:    v_add_u16_sdwa v60, v16, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v14, v38, v13
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v32
+; VI-NEXT:    v_add_u16_sdwa v31, v15, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_e32 v16, 3, v16
+; VI-NEXT:    v_add_u16_e32 v32, 3, v15
+; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v60
+; VI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v16, v16, v15
+; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v31
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v15, v32, v15
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v16
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v15
+; VI-NEXT:    v_lshrrev_b64 v[15:16], 24, v[15:16]
+; VI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT:    v_or_b32_e32 v13, v49, v13
+; VI-NEXT:    v_lshrrev_b32_e32 v15, 8, v14
+; VI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v15, 8, v13
+; VI-NEXT:    v_lshrrev_b64 v[13:14], 24, v[13:14]
+; VI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
+; VI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v13, 8, v11
+; VI-NEXT:    v_lshrrev_b64 v[11:12], 24, v[11:12]
+; VI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v11, 8, v10
+; VI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v11, 8, v9
+; VI-NEXT:    v_lshrrev_b64 v[9:10], 24, v[9:10]
+; VI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v9, 8, v7
+; VI-NEXT:    v_lshrrev_b64 v[7:8], 24, v[7:8]
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v7, 8, v6
+; VI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v7, 8, v5
+; VI-NEXT:    v_lshrrev_b64 v[5:6], 24, v[5:6]
+; VI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; VI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
+; VI-NEXT:    v_lshrrev_b64 v[40:41], 24, v[3:4]
+; VI-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; VI-NEXT:    v_lshrrev_b64 v[43:44], 24, v[1:2]
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v37
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v36
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b64 v[1:2], 24, v[36:37]
+; VI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v30
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v29
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b64 v[1:2], 24, v[29:30]
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v28
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v27
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b64 v[1:2], 24, v[27:28]
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v26
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v25
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b64 v[1:2], 24, v[25:26]
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v24
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v23
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v22
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v21
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v1, v60, 8, 8
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v1, v33, 8, 8
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v1, v63, 8, 8
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v1, v59, 8, 8
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v1, v34, 8, 8
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v1, v47, 8, 8
+; VI-NEXT:    v_lshrrev_b64 v[44:45], 24, v[19:20]
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v1, v56, 8, 8
+; VI-NEXT:    v_lshrrev_b64 v[45:46], 24, v[17:18]
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v1, v57, 8, 8
+; VI-NEXT:    v_mov_b32_e32 v46, v35
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v1, v52, 8, 8
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v1, v46, 8, 8
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v1, v39, 8, 8
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v49, v53
+; VI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; VI-NEXT:    v_mov_b32_e32 v52, v51
+; VI-NEXT:    v_bfe_u32 v31, v51, 8, 8
+; VI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshrrev_b64 v[54:55], 24, v[23:24]
+; VI-NEXT:    v_lshrrev_b64 v[41:42], 24, v[21:22]
+; VI-NEXT:    v_lshrrev_b32_e32 v42, 8, v20
+; VI-NEXT:    v_lshrrev_b32_e32 v38, 8, v19
+; VI-NEXT:    v_lshrrev_b32_e32 v62, 8, v18
+; VI-NEXT:    v_lshrrev_b32_e32 v55, 8, v17
+; VI-NEXT:    v_bfe_u32 v35, v58, 8, 8
+; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v39, v61, 8, 8
+; VI-NEXT:    v_bfe_u32 v58, v48, 8, 8
+; VI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_lshrrev_b64 v[42:43], 24, v[54:55]
-; VI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshrrev_b32_e32 v37, 8, v55
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_bfe_u32 v34, v59, 8, 8
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; VI-NEXT:    v_bfe_u32 v34, v52, 8, 8
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; VI-NEXT:    v_bfe_u32 v34, v62, 8, 8
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v52, v34
-; VI-NEXT:    v_bfe_u32 v34, v34, 8, 8
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v54, v34
-; VI-NEXT:    v_bfe_u32 v34, v34, 8, 8
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v51, v34
-; VI-NEXT:    v_bfe_u32 v34, v34, 8, 8
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v56, v34
-; VI-NEXT:    v_bfe_u32 v60, v34, 8, 8
-; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_bfe_u32 v34, v34, 8, 8
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; VI-NEXT:    v_bfe_u32 v34, v36, 8, 8
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; VI-NEXT:    v_bfe_u32 v34, v50, 8, 8
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; VI-NEXT:    v_bfe_u32 v34, v53, 8, 8
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; VI-NEXT:    v_bfe_u32 v34, v40, 8, 8
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_bfe_u32 v34, v34, 8, 8
-; VI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_bfe_u32 v48, v34, 8, 8
-; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v63, v34
-; VI-NEXT:    v_bfe_u32 v55, v34, 8, 8
+; VI-NEXT:    v_bfe_u32 v61, v53, 8, 8
 ; VI-NEXT:  .LBB98_4: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v43
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshlrev_b16_e32 v34, 8, v34
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshlrev_b16_e32 v34, 8, v34
-; VI-NEXT:    v_or_b32_sdwa v2, v2, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e32 v34, 8, v46
-; VI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v34, v46, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v60
-; VI-NEXT:    v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT:    v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v45
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v40
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 12, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v44
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 16, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v41
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 24, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 28, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v33
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 36, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 44, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 48, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 52, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 56, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT:    v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 60, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v42
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v55
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v45
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v58
-; VI-NEXT:    v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 64, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v37
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v55
-; VI-NEXT:    v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v62
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v31
+; VI-NEXT:    v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x44, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v39
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v38
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v44
+; VI-NEXT:    v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x48, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v48
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v42
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v61
+; VI-NEXT:    v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x4c, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v61
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v41
+; VI-NEXT:    v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x50, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v58
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x54, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v49
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v54
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x58, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v39
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x5c, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v38
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x60, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v35
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x64, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v35
-; VI-NEXT:    v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x68, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x6c, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x70, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x74, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(3)
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT:    v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x78, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x7c, v0
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
@@ -219804,8 +219846,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_readfirstlane_b32 s82, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v12
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v16
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v20
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v24
 ; SI-NEXT:    s_waitcnt vmcnt(14)
@@ -219836,7 +219878,9 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v43
 ; SI-NEXT:    v_writelane_b32 v62, s6, 0
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB99_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
@@ -219848,7 +219892,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v1, s56
 ; SI-NEXT:    s_and_b32 s4, s20, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s21, 16
-; SI-NEXT:    v_alignbit_b32 v5, s57, v1, 24
+; SI-NEXT:    v_alignbit_b32 v8, s57, v1, 24
 ; SI-NEXT:    v_alignbit_b32 v50, s57, v1, 16
 ; SI-NEXT:    v_alignbit_b32 v1, s57, v1, 8
 ; SI-NEXT:    s_or_b32 s46, s4, s5
@@ -219860,43 +219904,39 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_mov_b32_e32 v1, s46
 ; SI-NEXT:    s_and_b32 s4, s24, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s25, 16
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v5, s47, v1, 24
+; SI-NEXT:    v_alignbit_b32 v8, s47, v1, 24
 ; SI-NEXT:    s_or_b32 s44, s4, s5
 ; SI-NEXT:    s_and_b32 s4, s26, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s27, 16
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v5, s47, v1, 16
+; SI-NEXT:    v_alignbit_b32 v8, s47, v1, 16
 ; SI-NEXT:    v_alignbit_b32 v51, s47, v1, 8
 ; SI-NEXT:    s_or_b32 s45, s4, s5
 ; SI-NEXT:    v_mov_b32_e32 v1, s44
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v5, s45, v1, 24
 ; SI-NEXT:    s_and_b32 s4, s28, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s29, 16
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v5, s45, v1, 16
-; SI-NEXT:    v_alignbit_b32 v1, s45, v1, 8
+; SI-NEXT:    v_alignbit_b32 v8, s45, v1, 24
 ; SI-NEXT:    s_or_b32 s42, s4, s5
 ; SI-NEXT:    s_and_b32 s4, s82, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s81, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT:    s_or_b32 s43, s4, s5
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v8, s45, v1, 16
+; SI-NEXT:    v_alignbit_b32 v49, s45, v1, 8
+; SI-NEXT:    s_or_b32 s43, s4, s5
 ; SI-NEXT:    v_mov_b32_e32 v1, s42
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v5, s43, v1, 24
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v5, s43, v1, 16
-; SI-NEXT:    v_alignbit_b32 v1, s43, v1, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v8, s43, v1, 24
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v8, s43, v1, 16
+; SI-NEXT:    v_alignbit_b32 v48, s43, v1, 8
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v3
 ; SI-NEXT:    s_and_b32 s4, s85, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s84, 16
@@ -219946,7 +219986,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    s_or_b32 s10, s4, s5
 ; SI-NEXT:    s_and_b32 s4, s75, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s74, 16
-; SI-NEXT:    v_or_b32_e32 v12, v1, v6
+; SI-NEXT:    v_or_b32_e32 v12, v1, v5
 ; SI-NEXT:    s_or_b32 s9, s4, s5
 ; SI-NEXT:    s_and_b32 s4, s77, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s76, 16
@@ -219970,7 +220010,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v15
 ; SI-NEXT:    v_writelane_b32 v62, s4, 1
 ; SI-NEXT:    s_lshr_b32 s4, s10, 8
-; SI-NEXT:    v_or_b32_e32 v10, v1, v8
+; SI-NEXT:    v_or_b32_e32 v10, v1, v6
 ; SI-NEXT:    v_writelane_b32 v62, s4, 3
 ; SI-NEXT:    s_lshr_b32 s4, s9, 8
 ; SI-NEXT:    v_alignbit_b32 v1, s14, v10, 24
@@ -219991,34 +220031,32 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v19
 ; SI-NEXT:    v_writelane_b32 v62, s4, 15
 ; SI-NEXT:    s_and_b32 s4, s72, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v28, v8
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v8, v1, v9
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v23
 ; SI-NEXT:    v_writelane_b32 v62, s4, 2
 ; SI-NEXT:    s_and_b32 s4, s74, 0xffff
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v5, v1, v13
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v27
 ; SI-NEXT:    v_writelane_b32 v62, s4, 5
 ; SI-NEXT:    s_and_b32 s4, s76, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v30, v13
+; SI-NEXT:    v_mov_b32_e32 v28, v13
 ; SI-NEXT:    v_or_b32_e32 v13, v1, v17
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v31
 ; SI-NEXT:    v_writelane_b32 v62, s4, 8
 ; SI-NEXT:    s_and_b32 s4, s78, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v29, v9
+; SI-NEXT:    v_mov_b32_e32 v26, v9
 ; SI-NEXT:    v_or_b32_e32 v9, v1, v18
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v32
 ; SI-NEXT:    v_writelane_b32 v62, s4, 11
 ; SI-NEXT:    s_and_b32 s4, s88, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v26, v6
+; SI-NEXT:    v_mov_b32_e32 v25, v6
 ; SI-NEXT:    v_or_b32_e32 v6, v1, v20
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v33
 ; SI-NEXT:    v_writelane_b32 v62, s4, 14
 ; SI-NEXT:    s_bfe_u32 s4, s74, 0x80008
-; SI-NEXT:    v_mov_b32_e32 v25, v4
 ; SI-NEXT:    v_or_b32_e32 v4, v1, v21
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v34
 ; SI-NEXT:    v_writelane_b32 v62, s4, 4
@@ -220030,12 +220068,12 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v24
 ; SI-NEXT:    v_writelane_b32 v62, s4, 10
 ; SI-NEXT:    s_bfe_u32 s4, s88, 0x80008
-; SI-NEXT:    v_mov_b32_e32 v36, v17
-; SI-NEXT:    v_mov_b32_e32 v37, v18
-; SI-NEXT:    v_mov_b32_e32 v38, v20
-; SI-NEXT:    v_mov_b32_e32 v39, v21
-; SI-NEXT:    v_mov_b32_e32 v48, v22
-; SI-NEXT:    v_mov_b32_e32 v49, v24
+; SI-NEXT:    v_mov_b32_e32 v29, v17
+; SI-NEXT:    v_mov_b32_e32 v30, v18
+; SI-NEXT:    v_mov_b32_e32 v36, v20
+; SI-NEXT:    v_mov_b32_e32 v37, v21
+; SI-NEXT:    v_mov_b32_e32 v38, v22
+; SI-NEXT:    v_mov_b32_e32 v39, v24
 ; SI-NEXT:    s_lshr_b32 s68, s57, 8
 ; SI-NEXT:    s_lshr_b32 s65, s47, 8
 ; SI-NEXT:    s_lshr_b32 s54, s45, 8
@@ -220096,7 +220134,9 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_alignbit_b32 v54, s6, v1, 8
 ; SI-NEXT:    s_cbranch_execnz .LBB99_3
 ; SI-NEXT:  .LBB99_2: ; %cmp.true
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
 ; SI-NEXT:    s_add_i32 s18, s18, 3
 ; SI-NEXT:    s_and_b32 s4, s18, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s88, 16
@@ -220208,54 +220248,50 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v22, s47, v21, 24
+; SI-NEXT:    s_lshr_b32 s4, s11, 8
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 3, v3
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v22, s47, v21, 16
 ; SI-NEXT:    v_alignbit_b32 v51, s47, v21, 8
 ; SI-NEXT:    v_mov_b32_e32 v21, s44
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v22, s45, v21, 24
-; SI-NEXT:    s_lshr_b32 s4, s11, 8
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 3, v3
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v22, s45, v21, 16
-; SI-NEXT:    v_alignbit_b32 v21, s45, v21, 8
 ; SI-NEXT:    v_writelane_b32 v62, s4, 1
 ; SI-NEXT:    s_lshr_b32 s4, s10, 16
-; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v32
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v21, s42
+; SI-NEXT:    v_alignbit_b32 v22, s45, v21, 24
 ; SI-NEXT:    v_writelane_b32 v62, s4, 2
 ; SI-NEXT:    s_lshr_b32 s4, s10, 8
 ; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 3, v7
-; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    v_or_b32_e32 v3, v16, v3
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v22, s43, v21, 24
+; SI-NEXT:    v_alignbit_b32 v22, s45, v21, 16
+; SI-NEXT:    v_alignbit_b32 v49, s45, v21, 8
+; SI-NEXT:    v_mov_b32_e32 v21, s42
 ; SI-NEXT:    v_writelane_b32 v62, s4, 3
 ; SI-NEXT:    s_lshr_b32 s4, s9, 24
-; SI-NEXT:    v_or_b32_e32 v5, v38, v5
+; SI-NEXT:    v_or_b32_e32 v5, v36, v5
 ; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 0x30000, v3
 ; SI-NEXT:    v_mov_b32_e32 v3, s41
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v22, s43, v21, 16
-; SI-NEXT:    v_alignbit_b32 v21, s43, v21, 8
+; SI-NEXT:    v_alignbit_b32 v22, s43, v21, 24
 ; SI-NEXT:    v_writelane_b32 v62, s4, 4
 ; SI-NEXT:    s_lshr_b32 s4, s9, 16
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x30000, v5
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v31
-; SI-NEXT:    v_or_b32_e32 v7, v25, v7
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    v_or_b32_e32 v7, v14, v7
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v22, s43, v21, 16
+; SI-NEXT:    v_alignbit_b32 v48, s43, v21, 8
 ; SI-NEXT:    v_alignbit_b32 v21, v3, v16, 24
 ; SI-NEXT:    v_writelane_b32 v62, s4, 5
 ; SI-NEXT:    s_lshr_b32 s4, s9, 8
@@ -220269,7 +220305,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_alignbit_b32 v3, v3, v16, 8
 ; SI-NEXT:    v_writelane_b32 v62, s4, 6
 ; SI-NEXT:    s_lshr_b32 s4, s8, 24
-; SI-NEXT:    v_or_b32_e32 v5, v37, v5
+; SI-NEXT:    v_or_b32_e32 v5, v30, v5
 ; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -220278,7 +220314,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    s_lshr_b32 s4, s8, 16
 ; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x30000, v5
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v27
-; SI-NEXT:    v_or_b32_e32 v11, v26, v11
+; SI-NEXT:    v_or_b32_e32 v11, v12, v11
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v3, v7, v14, 16
@@ -220293,7 +220329,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_alignbit_b32 v3, v7, v14, 8
 ; SI-NEXT:    v_writelane_b32 v62, s4, 9
 ; SI-NEXT:    s_lshr_b32 s4, s7, 24
-; SI-NEXT:    v_or_b32_e32 v5, v36, v5
+; SI-NEXT:    v_or_b32_e32 v5, v29, v5
 ; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -220306,7 +220342,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_add_i32_e32 v13, vcc, 0x30000, v5
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v23
 ; SI-NEXT:    v_add_i32_e32 v8, vcc, 3, v19
-; SI-NEXT:    v_or_b32_e32 v10, v28, v10
+; SI-NEXT:    v_or_b32_e32 v10, v25, v10
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v3, v11, v12, 16
@@ -220324,11 +220360,11 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_alignbit_b32 v3, v11, v12, 8
 ; SI-NEXT:    v_writelane_b32 v62, s4, 12
 ; SI-NEXT:    s_lshr_b32 s4, s6, 24
-; SI-NEXT:    v_or_b32_e32 v1, v49, v1
-; SI-NEXT:    v_or_b32_e32 v2, v48, v2
-; SI-NEXT:    v_or_b32_e32 v4, v39, v4
-; SI-NEXT:    v_or_b32_e32 v5, v30, v5
-; SI-NEXT:    v_or_b32_e32 v8, v29, v8
+; SI-NEXT:    v_or_b32_e32 v1, v39, v1
+; SI-NEXT:    v_or_b32_e32 v2, v38, v2
+; SI-NEXT:    v_or_b32_e32 v4, v37, v4
+; SI-NEXT:    v_or_b32_e32 v5, v28, v5
+; SI-NEXT:    v_or_b32_e32 v8, v26, v8
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v3, v15, v10, 24
@@ -220496,64 +220532,65 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
 ; SI-NEXT:    v_mov_b32_e32 v7, s4
 ; SI-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; SI-NEXT:    s_and_b32 s4, s44, 0xff
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v49
 ; SI-NEXT:    v_or_b32_e32 v3, s4, v3
 ; SI-NEXT:    s_and_b32 s4, s45, 0xff
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    s_and_b32 s5, s66, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v7, v11, v7
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s16, s5
+; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    s_lshl_b32 s5, s51, 8
+; SI-NEXT:    s_lshl_b32 s16, s67, 24
+; SI-NEXT:    v_readlane_b32 s67, v63, 19
+; SI-NEXT:    v_readlane_b32 s66, v63, 18
+; SI-NEXT:    v_readlane_b32 s51, v63, 11
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; SI-NEXT:    v_or_b32_e32 v7, v11, v7
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v7
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
 ; SI-NEXT:    v_mov_b32_e32 v7, s4
 ; SI-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; SI-NEXT:    s_and_b32 s4, s42, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s51, 8
-; SI-NEXT:    s_lshl_b32 s16, s67, 24
-; SI-NEXT:    v_readlane_b32 s67, v63, 19
-; SI-NEXT:    v_readlane_b32 s66, v63, 18
-; SI-NEXT:    v_readlane_b32 s51, v63, 11
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v48
 ; SI-NEXT:    v_or_b32_e32 v3, s4, v3
 ; SI-NEXT:    s_and_b32 s4, s43, 0xff
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    s_and_b32 s5, s55, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v7, v11, v7
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s16, s5
+; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    s_lshl_b32 s5, s48, 8
+; SI-NEXT:    s_lshl_b32 s16, s64, 24
+; SI-NEXT:    v_readlane_b32 s64, v63, 16
+; SI-NEXT:    v_readlane_b32 s55, v63, 15
+; SI-NEXT:    v_readlane_b32 s48, v63, 8
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; SI-NEXT:    v_or_b32_e32 v7, v11, v7
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v7
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
@@ -220563,21 +220600,16 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v3, 0xff, v16
 ; SI-NEXT:    s_and_b32 s4, s41, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s48, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    s_and_b32 s5, s52, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s64, 24
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s16, s5
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    s_lshl_b32 s5, s37, 8
 ; SI-NEXT:    s_lshl_b32 s16, s53, 24
-; SI-NEXT:    v_readlane_b32 s64, v63, 16
-; SI-NEXT:    v_readlane_b32 s55, v63, 15
 ; SI-NEXT:    v_readlane_b32 s53, v63, 13
 ; SI-NEXT:    v_readlane_b32 s52, v63, 12
-; SI-NEXT:    v_readlane_b32 s48, v63, 8
 ; SI-NEXT:    v_readlane_b32 s37, v63, 5
 ; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
@@ -220937,62 +220969,55 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    ; implicit-def: $sgpr6
 ; SI-NEXT:    ; kill: killed $sgpr6
 ; SI-NEXT:    ; implicit-def: $vcc_lo
-; SI-NEXT:    v_mov_b32_e32 v49, v24
+; SI-NEXT:    v_mov_b32_e32 v39, v24
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $sgpr6
 ; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    v_mov_b32_e32 v48, v22
+; SI-NEXT:    v_mov_b32_e32 v38, v22
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $sgpr6
 ; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    v_mov_b32_e32 v39, v21
+; SI-NEXT:    v_mov_b32_e32 v37, v21
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $sgpr6
 ; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    v_mov_b32_e32 v38, v20
+; SI-NEXT:    v_mov_b32_e32 v36, v20
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $sgpr6
 ; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    v_mov_b32_e32 v37, v18
+; SI-NEXT:    v_mov_b32_e32 v30, v18
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $sgpr6
 ; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    v_mov_b32_e32 v36, v17
+; SI-NEXT:    v_mov_b32_e32 v29, v17
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $sgpr6
 ; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    v_mov_b32_e32 v30, v13
+; SI-NEXT:    v_mov_b32_e32 v28, v13
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $sgpr6
 ; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    v_mov_b32_e32 v29, v9
+; SI-NEXT:    v_mov_b32_e32 v26, v9
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $sgpr6
 ; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    v_mov_b32_e32 v28, v8
+; SI-NEXT:    v_mov_b32_e32 v25, v6
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $sgpr6
 ; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    v_mov_b32_e32 v26, v6
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    v_mov_b32_e32 v25, v4
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    ; implicit-def: $sgpr6
 ; SI-NEXT:    ; kill: killed $vcc_lo
 ; SI-NEXT:    ; implicit-def: $vcc_lo
 ; SI-NEXT:    ; implicit-def: $sgpr56
@@ -221008,11 +221033,13 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    ; implicit-def: $sgpr69
 ; SI-NEXT:    ; implicit-def: $sgpr80
 ; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; implicit-def: $sgpr45
 ; SI-NEXT:    ; implicit-def: $sgpr54
 ; SI-NEXT:    ; implicit-def: $sgpr66
 ; SI-NEXT:    ; implicit-def: $sgpr70
 ; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $sgpr43
 ; SI-NEXT:    ; implicit-def: $sgpr51
 ; SI-NEXT:    ; implicit-def: $sgpr55
@@ -221061,12 +221088,12 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    ; implicit-def: $sgpr9
 ; SI-NEXT:    ; implicit-def: $sgpr8
 ; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; kill: killed $vcc_lo
 ; SI-NEXT:    ; implicit-def: $vcc_lo
+; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; kill: killed $sgpr6
 ; SI-NEXT:    ; kill: killed $vcc_lo
 ; SI-NEXT:    ; implicit-def: $vgpr13
 ; SI-NEXT:    ; implicit-def: $vgpr24
@@ -221093,6 +221120,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT:    ; implicit-def: $vgpr52
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
@@ -222689,7 +222717,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v36
 ; GFX9-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 8, v13
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v36
 ; GFX9-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 24, v22
@@ -222697,7 +222725,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v35
 ; GFX9-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v22
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v38
 ; GFX9-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 8, v22
@@ -222996,7 +223024,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX9-NEXT:    v_readlane_b32 s4, v62, 22
 ; GFX9-NEXT:    v_mov_b32_e32 v60, s4
 ; GFX9-NEXT:    v_readlane_b32 s4, v62, 23
-; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_mov_b32_e32 v17, s4
 ; GFX9-NEXT:    v_readlane_b32 s4, v62, 24
 ; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
@@ -223004,7 +223032,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX9-NEXT:    v_readlane_b32 s4, v62, 25
 ; GFX9-NEXT:    v_mov_b32_e32 v23, s4
 ; GFX9-NEXT:    v_readlane_b32 s4, v62, 26
-; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_mov_b32_e32 v17, s4
 ; GFX9-NEXT:    v_readlane_b32 s4, v62, 27
 ; GFX9-NEXT:    v_mov_b32_e32 v59, s4
@@ -223274,14 +223302,14 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
@@ -229340,8 +229368,8 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:72
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:76
 ; SI-NEXT:    v_mul_f32_e32 v45, 1.0, v1
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v29
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v30
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -229365,431 +229393,403 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    v_mul_f32_e32 v63, 1.0, v18
 ; SI-NEXT:    v_mul_f32_e32 v19, 1.0, v19
-; SI-NEXT:    v_mul_f32_e32 v44, 1.0, v20
-; SI-NEXT:    v_mul_f32_e32 v21, 1.0, v21
+; SI-NEXT:    v_mul_f32_e32 v20, 1.0, v20
+; SI-NEXT:    v_mul_f32_e32 v44, 1.0, v21
 ; SI-NEXT:    v_mul_f32_e32 v46, 1.0, v22
 ; SI-NEXT:    v_mul_f32_e32 v47, 1.0, v23
 ; SI-NEXT:    v_mul_f32_e32 v56, 1.0, v24
 ; SI-NEXT:    v_mul_f32_e32 v57, 1.0, v25
-; SI-NEXT:    v_mul_f32_e32 v26, 1.0, v26
-; SI-NEXT:    v_mul_f32_e32 v58, 1.0, v27
-; SI-NEXT:    v_mul_f32_e32 v14, 1.0, v28
+; SI-NEXT:    v_mul_f32_e32 v58, 1.0, v26
+; SI-NEXT:    v_mul_f32_e32 v14, 1.0, v27
+; SI-NEXT:    v_mul_f32_e32 v29, 1.0, v29
 ; SI-NEXT:    v_mul_f32_e64 v15, 1.0, s17
 ; SI-NEXT:    v_mul_f32_e64 v16, 1.0, s18
 ; SI-NEXT:    v_mul_f32_e64 v17, 1.0, s19
 ; SI-NEXT:    v_mul_f32_e64 v18, 1.0, s20
-; SI-NEXT:    v_mul_f32_e64 v20, 1.0, s21
+; SI-NEXT:    v_mul_f32_e64 v21, 1.0, s21
 ; SI-NEXT:    v_mul_f32_e64 v22, 1.0, s22
 ; SI-NEXT:    v_mul_f32_e64 v23, 1.0, s23
 ; SI-NEXT:    v_mul_f32_e64 v24, 1.0, s24
 ; SI-NEXT:    v_mul_f32_e64 v25, 1.0, s25
-; SI-NEXT:    v_mul_f32_e64 v27, 1.0, s26
-; SI-NEXT:    v_mul_f32_e64 v28, 1.0, s27
-; SI-NEXT:    v_mul_f32_e64 v29, 1.0, s28
+; SI-NEXT:    v_mul_f32_e64 v26, 1.0, s26
+; SI-NEXT:    v_mul_f32_e64 v27, 1.0, s27
+; SI-NEXT:    v_mul_f32_e64 v30, 1.0, s29
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT:    v_mul_f32_e32 v31, 1.0, v30
+; SI-NEXT:    v_mul_f32_e32 v31, 1.0, v28
 ; SI-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; SI-NEXT:    v_mul_f32_e32 v33, 1.0, v33
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v36
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; SI-NEXT:    v_mul_f32_e32 v34, 1.0, v34
-; SI-NEXT:    s_waitcnt vmcnt(14) expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v39
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    v_mul_f32_e32 v35, 1.0, v35
-; SI-NEXT:    s_waitcnt vmcnt(12) expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v50
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(11) expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v52
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(10) expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v54
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e32 v36, 1.0, v36
 ; SI-NEXT:    v_mul_f32_e32 v37, 1.0, v37
-; SI-NEXT:    s_waitcnt vmcnt(8) expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v41
 ; SI-NEXT:    v_mul_f32_e32 v38, 1.0, v38
+; SI-NEXT:    s_waitcnt vmcnt(11) expcnt(0)
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v49
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(10) expcnt(0)
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v51
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e32 v39, 1.0, v39
 ; SI-NEXT:    v_mul_f32_e32 v48, 1.0, v48
-; SI-NEXT:    v_mul_f32_e32 v39, 1.0, v49
-; SI-NEXT:    v_mul_f32_e32 v51, 1.0, v51
-; SI-NEXT:    v_mul_f32_e32 v52, 1.0, v53
-; SI-NEXT:    v_mul_f32_e32 v54, 1.0, v55
-; SI-NEXT:    v_mul_f32_e32 v53, 1.0, v40
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_mul_f32_e32 v41, 1.0, v42
-; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_mul_f32_e32 v50, 1.0, v43
+; SI-NEXT:    s_waitcnt vmcnt(7) expcnt(0)
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v55
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(6) expcnt(0)
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v41
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v43
+; SI-NEXT:    v_mul_f32_e32 v49, 1.0, v50
+; SI-NEXT:    v_mul_f32_e32 v51, 1.0, v52
+; SI-NEXT:    v_mul_f32_e32 v53, 1.0, v53
+; SI-NEXT:    v_mul_f32_e32 v50, 1.0, v54
+; SI-NEXT:    v_mul_f32_e32 v52, 1.0, v40
+; SI-NEXT:    v_mul_f32_e32 v54, 1.0, v42
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s16
-; SI-NEXT:    v_mul_f32_e64 v30, 1.0, s29
-; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e64 v28, 1.0, s28
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB101_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v55, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_mov_b32_e32 v42, v51
+; SI-NEXT:    v_mov_b32_e32 v55, v50
+; SI-NEXT:    v_mov_b32_e32 v40, v52
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v15
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v16
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v17
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v41, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v18
+; SI-NEXT:    v_cvt_f32_f16_e32 v15, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v21
 ; SI-NEXT:    v_cvt_f32_f16_e32 v16, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v20
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v22
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_mov_b32_e32 v20, v44
-; SI-NEXT:    v_mov_b32_e32 v22, v21
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v23
 ; SI-NEXT:    v_cvt_f32_f16_e32 v18, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v24
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_mov_b32_e32 v24, v47
 ; SI-NEXT:    v_mov_b32_e32 v23, v46
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v25
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_mov_b32_e32 v25, v56
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v27
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v26
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_mov_b32_e32 v27, v57
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v24, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_mov_b32_e32 v28, v26
-; SI-NEXT:    v_mov_b32_e32 v29, v58
+; SI-NEXT:    v_mov_b32_e32 v26, v57
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v43, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v30, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v45
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_mov_b32_e32 v30, v14
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v45, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v32
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v29
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v36, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v34
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v36
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v9
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v48
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v36, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v38
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v12
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v13
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v39
-; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v51
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v33
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v59
 ; SI-NEXT:    v_cvt_f32_f16_e32 v59, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v60
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v35
+; SI-NEXT:    v_mov_b32_e32 v35, v43
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v61
 ; SI-NEXT:    v_cvt_f32_f16_e32 v60, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v62
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_mov_b32_e32 v38, v10
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v39
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v63
 ; SI-NEXT:    v_cvt_f32_f16_e32 v61, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_mov_b32_e32 v19, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v19, v28
+; SI-NEXT:    v_mov_b32_e32 v28, v14
+; SI-NEXT:    v_mov_b32_e32 v39, v22
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v44
-; SI-NEXT:    v_mov_b32_e32 v44, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v58
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v20
 ; SI-NEXT:    v_cvt_f32_f16_e32 v62, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v44
+; SI-NEXT:    v_cvt_f32_f16_e32 v44, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v46
-; SI-NEXT:    v_mov_b32_e32 v46, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v32
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v63, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v47
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v33
-; SI-NEXT:    v_mov_b32_e32 v33, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v34
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v6
-; SI-NEXT:    v_mov_b32_e32 v47, v2
+; SI-NEXT:    v_mov_b32_e32 v47, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v49
+; SI-NEXT:    v_mov_b32_e32 v49, v15
+; SI-NEXT:    v_mov_b32_e32 v15, v41
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v33, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v53
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v50
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v52
+; SI-NEXT:    v_mov_b32_e32 v51, v53
+; SI-NEXT:    v_mov_b32_e32 v53, v54
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v54
+; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; SI-NEXT:    v_cvt_f32_f16_e32 v43, v43
+; SI-NEXT:    v_mov_b32_e32 v20, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v57
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    v_mov_b32_e32 v48, v10
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v37
+; SI-NEXT:    v_mov_b32_e32 v37, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v48
+; SI-NEXT:    v_cvt_f32_f16_e32 v46, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v56
 ; SI-NEXT:    v_cvt_f32_f16_e32 v56, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v26
-; SI-NEXT:    v_mov_b32_e32 v26, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v21, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v35
-; SI-NEXT:    v_mov_b32_e32 v35, v55
-; SI-NEXT:    v_mov_b32_e32 v55, v53
-; SI-NEXT:    v_mov_b32_e32 v34, v43
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v41
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, v43
+; SI-NEXT:    v_mov_b32_e32 v27, v58
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v58
+; SI-NEXT:    v_cvt_f32_f16_e32 v57, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v58, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v32, v9
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v34, v13
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v50
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_mov_b32_e32 v32, v42
-; SI-NEXT:    v_mov_b32_e32 v42, v41
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, v43
-; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v39
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v51
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v57, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v31
-; SI-NEXT:    v_mov_b32_e32 v31, v36
-; SI-NEXT:    v_mov_b32_e32 v36, v54
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v37
-; SI-NEXT:    v_mov_b32_e32 v37, v24
-; SI-NEXT:    v_mov_b32_e32 v24, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v38
-; SI-NEXT:    v_cvt_f32_f16_e32 v58, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v49
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    v_mov_b32_e32 v38, v40
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    v_mov_b32_e32 v40, v50
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v51
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v52
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v52
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v54
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v14
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v22
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT:    s_waitcnt vmcnt(7) expcnt(2)
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v41
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT:    s_waitcnt vmcnt(6) expcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v50
 ; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; SI-NEXT:    v_mov_b32_e32 v54, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v53
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(1) expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v53
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v54
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v52
 ; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; SI-NEXT:    v_cvt_f32_f16_e32 v43, v43
 ; SI-NEXT:    s_cbranch_execnz .LBB101_3
 ; SI-NEXT:  .LBB101_2: ; %cmp.true
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v40
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v1
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v42
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(1) expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v53
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v54
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v55
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v53
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v36
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v52
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v40
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v51
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v50
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v55
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v39
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v51
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v42
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v41
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v49
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v22
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v48
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v12
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v36
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
@@ -229798,10 +229798,10 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v30
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v29
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v29
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v31
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v28
@@ -229810,329 +229810,335 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v27
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v25
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v26
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v25
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v24
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v14
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v23
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v14
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v22
+; SI-NEXT:    v_cvt_f32_f16_e32 v63, v63
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v1
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v14
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v62, v62
-; SI-NEXT:    v_cvt_f32_f16_e32 v63, v63
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v1
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v61, v61
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v60, v60
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v59, v59
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v38, v58
+; SI-NEXT:    v_cvt_f32_f16_e32 v58, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v43
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v24, 16, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v57
-; SI-NEXT:    v_cvt_f32_f16_e32 v57, v9
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v14
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v37, v57
+; SI-NEXT:    v_cvt_f32_f16_e32 v57, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v12
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v13
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v14
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v56
-; SI-NEXT:    v_cvt_f32_f16_e32 v56, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v39
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v51
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v50
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v14
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v36, v56
+; SI-NEXT:    v_cvt_f32_f16_e32 v56, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v9
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v14
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v23
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v25
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v26
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v27
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v14
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v14
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, v46
-; SI-NEXT:    v_cvt_f32_f16_e32 v46, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v13
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v47
+; SI-NEXT:    v_cvt_f32_f16_e32 v47, v24
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v14
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v14
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v45, v45
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v19, v20
-; SI-NEXT:    v_cvt_f32_f16_e32 v20, v22
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v20, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v19, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v24
-; SI-NEXT:    v_cvt_f32_f16_e32 v24, v58
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v20, v25
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v30, v17
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v46
+; SI-NEXT:    v_cvt_f32_f16_e32 v46, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v11
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v35, v55
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v54
-; SI-NEXT:    v_cvt_f32_f16_e32 v54, v52
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v39, v54
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v14
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT:    v_cvt_f32_f16_e32 v49, v16
+; SI-NEXT:    v_cvt_f32_f16_e32 v16, v42
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v38, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v32
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v42
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v53
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v15
+; SI-NEXT:    v_cvt_f32_f16_e32 v15, v32
+; SI-NEXT:    v_cvt_f32_f16_e32 v32, v48
 ; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v47
-; SI-NEXT:    v_cvt_f32_f16_e32 v47, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v7
 ; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v44
 ; SI-NEXT:    v_cvt_f32_f16_e32 v44, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v28
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v30
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v7
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v41
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v43
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v33
 ; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v40
 ; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v55
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v34
+; SI-NEXT:    v_cvt_f32_f16_e32 v34, v13
 ; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v52
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_cvt_f32_f16_e32 v58, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v49
-; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
 ; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v43, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v33, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v51
 ; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, v50
-; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; SI-NEXT:    s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v43, v14
 ; SI-NEXT:  .LBB101_3: ; %end
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v53, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v35
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v53, 16, v53
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
 ; SI-NEXT:    v_or_b32_e32 v14, v14, v53
 ; SI-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v38
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; SI-NEXT:    v_or_b32_e32 v14, v15, v14
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 4, v0
 ; SI-NEXT:    buffer_store_dword v14, v15, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v32
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, v49
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 12, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; SI-NEXT:    v_or_b32_e32 v14, v15, v14
@@ -230148,7 +230154,7 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    buffer_store_dword v14, v16, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 16, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
@@ -230157,61 +230163,57 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; SI-NEXT:    v_or_b32_e32 v14, v15, v14
 ; SI-NEXT:    buffer_store_dword v14, v16, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v37
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v39
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 20, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; SI-NEXT:    v_or_b32_e32 v14, v15, v14
 ; SI-NEXT:    buffer_store_dword v14, v16, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v17
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v30
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, v35
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 24, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; SI-NEXT:    v_or_b32_e32 v14, v15, v14
 ; SI-NEXT:    buffer_store_dword v14, v16, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v45
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, v19
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 28, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; SI-NEXT:    v_or_b32_e32 v14, v15, v14
 ; SI-NEXT:    buffer_store_dword v14, v16, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v34
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v17
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, v20
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 32, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; SI-NEXT:    v_or_b32_e32 v14, v15, v14
 ; SI-NEXT:    buffer_store_dword v14, v16, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v31
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v19
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v21
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 36, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; SI-NEXT:    v_or_b32_e32 v14, v15, v14
 ; SI-NEXT:    buffer_store_dword v14, v16, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v26
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v36
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 40, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; SI-NEXT:    v_or_b32_e32 v14, v15, v14
 ; SI-NEXT:    buffer_store_dword v14, v16, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v33
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v37
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 44, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -230219,13 +230221,13 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    v_or_b32_e32 v14, v15, v14
 ; SI-NEXT:    buffer_store_dword v14, v16, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v21
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v38
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, v47
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 48, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; SI-NEXT:    v_or_b32_e32 v14, v15, v14
 ; SI-NEXT:    buffer_store_dword v14, v16, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v59
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 52, v0
@@ -230234,21 +230236,25 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; SI-NEXT:    v_or_b32_e32 v14, v15, v14
 ; SI-NEXT:    buffer_store_dword v14, v16, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v60
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v44
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 56, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; SI-NEXT:    v_or_b32_e32 v14, v15, v14
 ; SI-NEXT:    buffer_store_dword v14, v16, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v61
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v46
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 60, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; SI-NEXT:    v_or_b32_e32 v14, v15, v14
 ; SI-NEXT:    buffer_store_dword v14, v16, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v62
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 64, v0
@@ -230259,16 +230265,14 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    buffer_store_dword v14, v16, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v63
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v47
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, v44
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 0x44, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; SI-NEXT:    v_or_b32_e32 v14, v15, v14
 ; SI-NEXT:    buffer_store_dword v14, v16, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v46
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 0x48, v0
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
 ; SI-NEXT:    v_or_b32_e32 v1, v14, v1
 ; SI-NEXT:    buffer_store_dword v1, v15, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -230278,23 +230282,21 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v14, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v57
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x50, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v57
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v58
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x54, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v5
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x58, v0
@@ -230303,11 +230305,13 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v48
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x5c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
@@ -230319,20 +230323,20 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v58
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x64, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v32
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x68, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
@@ -230344,35 +230348,33 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v33
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x70, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v54
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x74, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v13
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v34
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x78, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v43
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -230397,17 +230399,20 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB101_4:
-; SI-NEXT:    v_mov_b32_e32 v55, v53
-; SI-NEXT:    v_mov_b32_e32 v30, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v53, v54
+; SI-NEXT:    v_mov_b32_e32 v40, v52
+; SI-NEXT:    v_mov_b32_e32 v55, v50
+; SI-NEXT:    v_mov_b32_e32 v42, v51
+; SI-NEXT:    v_mov_b32_e32 v28, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
@@ -230431,75 +230436,70 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    v_mov_b32_e32 v40, v50
-; SI-NEXT:    v_mov_b32_e32 v42, v41
-; SI-NEXT:    v_mov_b32_e32 v36, v54
-; SI-NEXT:    v_mov_b32_e32 v29, v58
-; SI-NEXT:    v_mov_b32_e32 v28, v26
-; SI-NEXT:    v_mov_b32_e32 v27, v57
+; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    v_mov_b32_e32 v27, v58
+; SI-NEXT:    v_mov_b32_e32 v26, v57
 ; SI-NEXT:    v_mov_b32_e32 v25, v56
+; SI-NEXT:    v_mov_b32_e32 v24, v47
 ; SI-NEXT:    v_mov_b32_e32 v23, v46
-; SI-NEXT:    v_mov_b32_e32 v22, v21
-; SI-NEXT:    v_mov_b32_e32 v20, v44
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $vgpr8
 ; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $vgpr12
 ; SI-NEXT:    ; implicit-def: $vgpr43
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr38
 ; SI-NEXT:    ; implicit-def: $vgpr15
+; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr37
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr34
+; SI-NEXT:    ; implicit-def: $vgpr39
+; SI-NEXT:    ; implicit-def: $vgpr35
+; SI-NEXT:    ; implicit-def: $vgpr30
 ; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; implicit-def: $vgpr20
+; SI-NEXT:    ; implicit-def: $vgpr17
 ; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr24
+; SI-NEXT:    ; implicit-def: $vgpr36
+; SI-NEXT:    ; implicit-def: $vgpr37
+; SI-NEXT:    ; implicit-def: $vgpr47
+; SI-NEXT:    ; implicit-def: $vgpr38
 ; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr44
 ; SI-NEXT:    ; implicit-def: $vgpr60
-; SI-NEXT:    ; implicit-def: $vgpr46
 ; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr62
-; SI-NEXT:    ; implicit-def: $vgpr47
+; SI-NEXT:    ; implicit-def: $vgpr44
 ; SI-NEXT:    ; implicit-def: $vgpr63
-; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr46
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr56
 ; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr57
+; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr58
 ; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; kill: killed $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; kill: killed $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr6
 ; SI-NEXT:    ; kill: killed $vgpr7
 ; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr58
+; SI-NEXT:    ; kill: killed $vgpr8
 ; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; kill: killed $vgpr9
+; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $vgpr9
 ; SI-NEXT:    ; kill: killed $vgpr10
 ; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; kill: killed $vgpr11
+; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr54
+; SI-NEXT:    ; kill: killed $vgpr12
 ; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; kill: killed $vgpr13
+; SI-NEXT:    ; implicit-def: $vgpr34
 ; SI-NEXT:    ; implicit-def: $vgpr13
 ; SI-NEXT:    ; kill: killed $vgpr43
 ; SI-NEXT:    ; implicit-def: $vgpr43
@@ -233773,11 +233773,10 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v63, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v62
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_add_f32_e32 v62, 0x38000000, v31
-; SI-NEXT:    v_add_f32_e32 v31, 0x38000000, v63
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT:    v_add_f32_e32 v62, 0x38000000, v63
 ; SI-NEXT:    v_add_f32_e32 v63, 0x38000000, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v61
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
 ; SI-NEXT:    v_cvt_f32_f16_e32 v60, v60
 ; SI-NEXT:    v_cvt_f32_f16_e32 v61, v59
 ; SI-NEXT:    v_cvt_f32_f16_e32 v58, v58
@@ -233898,6 +233897,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_add_f32_e32 v31, 0x38000000, v31
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -234138,9 +234138,9 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v63
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v31
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v62
 ; SI-NEXT:    v_add_f32_e32 v58, 0x38000000, v58
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v62
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v31
 ; SI-NEXT:    v_cvt_f16_f32_e32 v58, v58
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
@@ -234806,15 +234806,16 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:72
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:76
 ; SI-NEXT:    v_cvt_f16_f32_e32 v40, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v57, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v8
 ; SI-NEXT:    v_mov_b32_e32 v46, v26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v43, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v56, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v44, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v59, v9
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
@@ -234827,599 +234828,588 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
 ; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, v23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v45, v24
+; SI-NEXT:    v_cvt_f16_f32_e32 v26, v25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v46, v46
-; SI-NEXT:    v_cvt_f16_f32_e32 v58, v27
-; SI-NEXT:    v_cvt_f16_f32_e32 v60, v28
-; SI-NEXT:    v_cvt_f16_f32_e32 v28, v29
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
+; SI-NEXT:    v_cvt_f16_f32_e32 v47, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v56, v29
+; SI-NEXT:    v_cvt_f16_f32_e32 v57, v30
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v23, s17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v24, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v25, s19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v29, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v27, s23
+; SI-NEXT:    v_cvt_f16_f32_e32 v30, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v27, s24
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; SI-NEXT:    v_cvt_f16_f32_e32 v31, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v32
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v35
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, v23
+; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
 ; SI-NEXT:    v_cvt_f16_f32_e32 v33, v33
 ; SI-NEXT:    v_cvt_f16_f32_e32 v34, v34
+; SI-NEXT:    v_cvt_f16_f32_e32 v58, v35
 ; SI-NEXT:    v_cvt_f16_f32_e32 v36, v36
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(14) expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v38
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT:    v_cvt_f16_f32_e32 v59, v37
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, v38
 ; SI-NEXT:    s_waitcnt vmcnt(13)
-; SI-NEXT:    v_cvt_f16_f32_e32 v47, v39
-; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v49
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v48
-; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_cvt_f16_f32_e32 v52, v52
+; SI-NEXT:    v_cvt_f16_f32_e32 v39, v39
+; SI-NEXT:    s_waitcnt vmcnt(12)
+; SI-NEXT:    v_cvt_f16_f32_e32 v61, v48
+; SI-NEXT:    s_waitcnt vmcnt(11) expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v49
 ; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_cvt_f16_f32_e32 v53, v53
+; SI-NEXT:    v_cvt_f16_f32_e32 v52, v52
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_cvt_f16_f32_e32 v54, v54
+; SI-NEXT:    v_cvt_f16_f32_e32 v53, v53
 ; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_cvt_f16_f32_e32 v54, v54
+; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v55, v55
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v50
+; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v41, v41
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v42, v42
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v62, v62
-; SI-NEXT:    v_cvt_f16_f32_e32 v63, v63
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cvt_f16_f32_e32 v50, v51
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, s19
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, s22
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, s24
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, s25
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, s26
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, s27
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, s28
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, s29
+; SI-NEXT:    v_cvt_f16_f32_e32 v63, v63
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v50
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f16_f32_e32 v44, v51
+; SI-NEXT:    v_cvt_f16_f32_e32 v23, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v38, s22
+; SI-NEXT:    v_cvt_f16_f32_e32 v37, s23
+; SI-NEXT:    v_cvt_f16_f32_e32 v48, s25
+; SI-NEXT:    v_cvt_f16_f32_e32 v49, s26
+; SI-NEXT:    v_cvt_f16_f32_e32 v35, s27
+; SI-NEXT:    v_cvt_f16_f32_e32 v50, s28
+; SI-NEXT:    v_cvt_f16_f32_e32 v51, s29
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB103_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v57
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v23
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v24
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v44
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v25
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v59
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v29
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v30
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v12
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v38
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v27
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v13
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v37
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v35
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v27
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v38
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v15
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v48
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v31
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v49
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v17
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v35
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v32
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v50
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v39
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT:    v_mov_b32_e32 v24, v19
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v19
-; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v60
-; SI-NEXT:    v_mov_b32_e32 v39, v30
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v30
-; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v33
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v36
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v51
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v40
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v20
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v43
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v43, v6
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v43
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v20
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v21
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v28
 ; SI-NEXT:    s_mov_b64 s[4:5], 0
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v56
-; SI-NEXT:    v_mov_b32_e32 v23, v17
-; SI-NEXT:    v_mov_b32_e32 v25, v20
-; SI-NEXT:    v_mov_b32_e32 v29, v21
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_mov_b32_e32 v50, v19
 ; SI-NEXT:    v_mov_b32_e32 v51, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v22
-; SI-NEXT:    v_mov_b32_e32 v48, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v45
-; SI-NEXT:    v_mov_b32_e32 v49, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v16
-; SI-NEXT:    v_mov_b32_e32 v35, v46
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v5, v57
-; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v46
-; SI-NEXT:    v_mov_b32_e32 v43, v59
-; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v58
-; SI-NEXT:    v_mov_b32_e32 v32, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v34
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v47
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v52
+; SI-NEXT:    v_mov_b32_e32 v38, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; SI-NEXT:    v_mov_b32_e32 v37, v45
+; SI-NEXT:    v_mov_b32_e32 v27, v26
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v26
+; SI-NEXT:    v_mov_b32_e32 v49, v47
+; SI-NEXT:    v_mov_b32_e32 v35, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v58
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v59
+; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v60
+; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v39
+; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v52
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v53
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v54
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v55
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v41
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v42
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v62
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v63
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v50
-; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v33
-; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_lshlrev_b32_e32 v31, 16, v36
+; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v54
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v55
+; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v41
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v62
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v12
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v13
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v14
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v31
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v17
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v18
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v19
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v46
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v47
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v57
+; SI-NEXT:    v_mov_b32_e32 v57, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v32
+; SI-NEXT:    v_mov_b32_e32 v32, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v33
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v45
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v56
+; SI-NEXT:    v_mov_b32_e32 v33, v12
+; SI-NEXT:    v_mov_b32_e32 v34, v5
+; SI-NEXT:    v_mov_b32_e32 v58, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v36
+; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v61
+; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v42
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v63
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v44
+; SI-NEXT:    v_mov_b32_e32 v44, v18
+; SI-NEXT:    v_mov_b32_e32 v5, v43
+; SI-NEXT:    v_mov_b32_e32 v18, v6
 ; SI-NEXT:    s_branch .LBB103_3
 ; SI-NEXT:  .LBB103_2:
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; kill: killed $vgpr3
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
 ; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; kill: killed $vgpr5
-; SI-NEXT:    v_mov_b32_e32 v39, v30
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; kill: killed $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; kill: killed $vgpr5
-; SI-NEXT:    v_mov_b32_e32 v32, v28
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; kill: killed $vgpr5
-; SI-NEXT:    v_mov_b32_e32 v35, v46
-; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; kill: killed $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; kill: killed $vgpr5
-; SI-NEXT:    v_mov_b32_e32 v49, v16
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; kill: killed $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; kill: killed $vgpr5
-; SI-NEXT:    v_mov_b32_e32 v48, v26
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; kill: killed $vgpr5
-; SI-NEXT:    v_mov_b32_e32 v51, v22
-; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; kill: killed $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; kill: killed $vgpr5
-; SI-NEXT:    v_mov_b32_e32 v29, v21
+; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; kill: killed $vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; kill: killed $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; kill: killed $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; kill: killed $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; kill: killed $vgpr5
-; SI-NEXT:    v_mov_b32_e32 v25, v20
+; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; kill: killed $vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; kill: killed $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; kill: killed $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; kill: killed $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; kill: killed $vgpr5
-; SI-NEXT:    v_mov_b32_e32 v24, v19
+; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; kill: killed $vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; kill: killed $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; kill: killed $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; kill: killed $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; kill: killed $vgpr5
-; SI-NEXT:    v_mov_b32_e32 v23, v17
+; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; kill: killed $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; kill: killed $vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; kill: killed $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; kill: killed $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; kill: killed $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; kill: killed $vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    v_mov_b32_e32 v35, v28
+; SI-NEXT:    v_mov_b32_e32 v49, v47
+; SI-NEXT:    v_mov_b32_e32 v27, v26
+; SI-NEXT:    v_mov_b32_e32 v37, v45
+; SI-NEXT:    v_mov_b32_e32 v38, v16
+; SI-NEXT:    v_mov_b32_e32 v51, v22
+; SI-NEXT:    v_mov_b32_e32 v50, v19
 ; SI-NEXT:    s_mov_b64 s[4:5], -1
-; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; kill: killed $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; kill: killed $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    v_mov_b32_e32 v43, v59
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; kill: killed $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; kill: killed $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr8
+; SI-NEXT:    v_mov_b32_e32 v5, v6
+; SI-NEXT:    ; kill: killed $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; kill: killed $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr56
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr61
-; SI-NEXT:    ; implicit-def: $vgpr58
-; SI-NEXT:    ; implicit-def: $vgpr60
-; SI-NEXT:    ; implicit-def: $vgpr38
-; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $vgpr17
+; SI-NEXT:    ; implicit-def: $vgpr16
 ; SI-NEXT:    ; implicit-def: $vgpr19
+; SI-NEXT:    ; implicit-def: $vgpr8
+; SI-NEXT:    ; implicit-def: $vgpr57
+; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr46
+; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr34
+; SI-NEXT:    ; implicit-def: $vgpr44
+; SI-NEXT:    ; implicit-def: $vgpr58
+; SI-NEXT:    ; implicit-def: $vgpr20
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr23
 ; SI-NEXT:    ; implicit-def: $vgpr40
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $vgpr28
+; SI-NEXT:    ; implicit-def: $vgpr24
+; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; implicit-def: $vgpr25
 ; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr17
+; SI-NEXT:    ; implicit-def: $vgpr29
+; SI-NEXT:    ; implicit-def: $vgpr28
+; SI-NEXT:    ; implicit-def: $vgpr30
+; SI-NEXT:    ; implicit-def: $vgpr47
+; SI-NEXT:    ; implicit-def: $vgpr26
+; SI-NEXT:    ; implicit-def: $vgpr15
 ; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr11
 ; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; kill: killed $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; kill: killed $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; kill: killed $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; kill: killed $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; kill: killed $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; kill: killed $vgpr5
-; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(5)
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v5, v57
-; SI-NEXT:    ; implicit-def: $vgpr57
 ; SI-NEXT:  .LBB103_3: ; %Flow
-; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; SI-NEXT:    v_mov_b32_e32 v47, v49
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_mov_b32_e32 v50, v2
+; SI-NEXT:    v_mov_b32_e32 v36, v2
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v43, v9
+; SI-NEXT:    v_mov_b32_e32 v12, v31
+; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT:    v_mov_b32_e32 v31, v11
+; SI-NEXT:    v_mov_b32_e32 v9, v17
 ; SI-NEXT:    s_cbranch_vccnz .LBB103_5
 ; SI-NEXT:  ; %bb.4: ; %cmp.true
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v63
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v62
-; SI-NEXT:    v_mov_b32_e32 v26, v48
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v54
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v34
-; SI-NEXT:    v_mov_b32_e32 v1, v39
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v51
-; SI-NEXT:    v_cvt_f32_f16_e32 v50, v13
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v18
-; SI-NEXT:    v_cvt_f32_f16_e32 v28, v24
-; SI-NEXT:    v_add_f32_e32 v50, 0x38000000, v50
-; SI-NEXT:    v_add_f32_e32 v45, 0x38000000, v45
-; SI-NEXT:    v_add_f32_e32 v31, 0x38000000, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v25
-; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v29
-; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v2
-; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v36
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v63
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v62
+; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v10
+; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v8
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v10
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v14
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v55
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v41
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v15, v54
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v42
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v14
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v15
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v55, 0x38000000, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v52
+; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v15
+; SI-NEXT:    v_cvt_f32_f16_e32 v15, v61
 ; SI-NEXT:    v_add_f32_e32 v54, 0x38000000, v8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v53
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v41
+; SI-NEXT:    v_add_f32_e32 v29, 0x38000000, v15
+; SI-NEXT:    v_mov_b32_e32 v6, v37
+; SI-NEXT:    v_add_f32_e32 v30, 0x38000000, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v39
+; SI-NEXT:    v_add_f32_e32 v55, 0x38000000, v10
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v52
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v51
+; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v10
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v60
+; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v59
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v50
+; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v10
+; SI-NEXT:    v_cvt_f32_f16_e32 v50, v13
+; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v33, v12
+; SI-NEXT:    v_cvt_f32_f16_e32 v45, v5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v42, v43
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, v44
-; SI-NEXT:    v_add_f32_e32 v53, 0x38000000, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v43, v18
+; SI-NEXT:    v_add_f32_e32 v50, 0x38000000, v50
+; SI-NEXT:    v_add_f32_e32 v45, 0x38000000, v45
 ; SI-NEXT:    v_add_f32_e32 v42, 0x38000000, v42
 ; SI-NEXT:    v_add_f32_e32 v43, 0x38000000, v43
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v2
-; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v10
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v14
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT:    v_add_f32_e32 v33, 0x38000000, v33
+; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v49
+; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
+; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_cvt_f32_f16_e32 v15, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v52, 0x38000000, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v36
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v37
-; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v10
-; SI-NEXT:    v_add_f32_e32 v30, 0x38000000, v14
+; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v15
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v32
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v23
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v56
 ; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v47
-; SI-NEXT:    v_add_f32_e32 v32, 0x38000000, v32
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v27
 ; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v49, 0x38000000, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v15
-; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v8
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v3
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v52, 0x38000000, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v35
+; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v38
+; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v20, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v10
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v46
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v33, 0x38000000, v33
+; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v31, 0x38000000, v31
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v32, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v32, 0x38000000, v32
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v34, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v34, 0x38000000, v34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v36, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v36, 0x38000000, v36
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v51, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v51, 0x38000000, v51
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v40, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v40, 0x38000000, v40
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v41, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v41, 0x38000000, v41
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v44, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v44, 0x38000000, v44
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v46, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v46, 0x38000000, v46
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v47, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v47, 0x38000000, v47
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v56, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v56, 0x38000000, v56
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v57, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v2
-; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v26
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v57, 0x38000000, v57
-; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v10
-; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v35
-; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
-; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v58, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v58, 0x38000000, v58
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v26, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v22, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v19, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v35, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v35
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v13, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v12, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v59, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v59, 0x38000000, v59
 ; SI-NEXT:    v_cvt_f16_f32_e32 v59, v59
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v60, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v60, 0x38000000, v60
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v61, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v61, 0x38000000, v61
 ; SI-NEXT:    v_cvt_f16_f32_e32 v61, v61
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v62, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v62, 0x38000000, v62
 ; SI-NEXT:    v_cvt_f16_f32_e32 v62, v62
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v63, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v63, 0x38000000, v63
 ; SI-NEXT:    v_cvt_f16_f32_e32 v63, v63
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -235465,22 +235455,14 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, v57
-; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v2
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v61, 16, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v58
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, v57
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v30
-; SI-NEXT:    v_mov_b32_e32 v30, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
@@ -235504,7 +235486,7 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v44
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v43
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v42
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v41
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -235539,92 +235521,99 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v33
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v32
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v31
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v31, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v7
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v12
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v28
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v21
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v6
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v7
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v14
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v17
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v39
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v52
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v23
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v29
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v27
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v24
+; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v14
+; SI-NEXT:    v_mov_b32_e32 v16, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v37
-; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v31, 16, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v39
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v49
-; SI-NEXT:    v_mov_b32_e32 v8, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v52
-; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v53
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v25
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v55
+; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v30
+; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v4
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v54
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v55
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v48
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v4
-; SI-NEXT:    v_mov_b32_e32 v4, v18
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v5
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v3
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
+; SI-NEXT:    v_mov_b32_e32 v4, v27
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v3
+; SI-NEXT:    v_mov_b32_e32 v3, v13
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
 ; SI-NEXT:  .LBB103_5: ; %end
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -235714,7 +235703,7 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v8
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
@@ -235775,19 +235764,17 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v31
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
@@ -235797,8 +235784,8 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
@@ -235807,7 +235794,7 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 64, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v9
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
@@ -235817,100 +235804,102 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v56
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v19
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v6
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v16
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x48, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v57
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v8
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v45
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v61
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v32
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v59
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v57
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x50, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v60
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v33
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v58
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v10
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v30
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v44
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v38
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v34
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v50
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v20
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v27
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v58
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v10
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v23
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v19
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v22
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v46
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v24
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v31
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v40
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v12
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v48
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v40
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v45
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v21
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v28
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v25
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x6c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v7
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v28
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v17
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v29
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v11
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v47
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v16
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v30
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x74, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v20
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v15
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v14
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v26
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v15
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v22
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v7
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
@@ -236262,17 +236251,17 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:136
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:4
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:12
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:20
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:28
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:136
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:40
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:48
@@ -236291,170 +236280,169 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:80
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:84
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:92
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:88
-; SI-NEXT:    v_mul_f32_e32 v34, 1.0, v1
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:92
+; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:88
+; SI-NEXT:    v_mul_f32_e32 v33, 1.0, v1
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v3
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v4
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v7
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v11
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v12
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v15
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v19
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v20
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v23
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v27
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v28
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; SI-NEXT:    v_mul_f32_e32 v31, 1.0, v10
-; SI-NEXT:    v_mul_f32_e32 v12, 1.0, v18
-; SI-NEXT:    v_mul_f32_e32 v10, 1.0, v22
-; SI-NEXT:    v_mul_f32_e32 v18, 1.0, v30
-; SI-NEXT:    v_mul_f32_e32 v11, 1.0, v25
-; SI-NEXT:    v_mul_f32_e32 v19, 1.0, v29
-; SI-NEXT:    v_mul_f32_e32 v20, 1.0, v26
-; SI-NEXT:    v_mul_f32_e32 v33, 1.0, v2
-; SI-NEXT:    v_mul_f32_e32 v55, 1.0, v5
-; SI-NEXT:    v_mul_f32_e32 v54, 1.0, v6
-; SI-NEXT:    v_mul_f32_e32 v32, 1.0, v9
-; SI-NEXT:    v_mul_f32_e32 v53, 1.0, v13
-; SI-NEXT:    v_mul_f32_e32 v14, 1.0, v14
-; SI-NEXT:    v_mul_f32_e32 v13, 1.0, v17
-; SI-NEXT:    v_mul_f32_e32 v21, 1.0, v21
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e32 v54, 1.0, v13
+; SI-NEXT:    v_mul_f32_e32 v19, 1.0, v21
+; SI-NEXT:    v_mul_f32_e32 v12, 1.0, v22
+; SI-NEXT:    v_mul_f32_e32 v13, 1.0, v25
+; SI-NEXT:    v_mul_f32_e32 v15, 1.0, v17
+; SI-NEXT:    v_mul_f32_e32 v17, 1.0, v29
+; SI-NEXT:    v_mul_f32_e32 v43, 1.0, v2
+; SI-NEXT:    v_mul_f32_e32 v40, 1.0, v5
+; SI-NEXT:    v_mul_f32_e32 v32, 1.0, v6
+; SI-NEXT:    v_mul_f32_e32 v31, 1.0, v9
+; SI-NEXT:    v_mul_f32_e32 v55, 1.0, v10
+; SI-NEXT:    v_mul_f32_e32 v53, 1.0, v14
+; SI-NEXT:    v_mul_f32_e32 v14, 1.0, v18
+; SI-NEXT:    v_mul_f32_e32 v18, 1.0, v26
+; SI-NEXT:    v_mul_f32_e32 v11, 1.0, v30
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v35
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v34
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v35
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v36
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v37
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v39
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v48
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e32 v10, 1.0, v37
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v49
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v52
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:96
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:100
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:104
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:108
-; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:112
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:116
-; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:120
-; SI-NEXT:    v_mul_f32_e32 v17, 1.0, v38
-; SI-NEXT:    v_mul_f32_e32 v9, 1.0, v39
-; SI-NEXT:    v_mul_f32_e32 v8, 1.0, v50
-; SI-NEXT:    v_mul_f32_e32 v25, 1.0, v44
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:104
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:108
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:112
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:116
+; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:120
+; SI-NEXT:    v_mul_f32_e32 v28, 1.0, v44
 ; SI-NEXT:    v_mul_f32_e32 v44, 1.0, v46
-; SI-NEXT:    v_mul_f32_e32 v7, 1.0, v51
-; SI-NEXT:    v_mul_f32_e32 v15, 1.0, v52
-; SI-NEXT:    v_mul_f32_e32 v40, 1.0, v40
+; SI-NEXT:    v_mul_f32_e32 v9, 1.0, v38
+; SI-NEXT:    v_mul_f32_e32 v8, 1.0, v49
+; SI-NEXT:    v_mul_f32_e32 v7, 1.0, v50
 ; SI-NEXT:    v_mul_f32_e32 v46, 1.0, v58
-; SI-NEXT:    v_mul_f32_e32 v61, 1.0, v59
+; SI-NEXT:    v_mul_f32_e32 v29, 1.0, v59
+; SI-NEXT:    v_mul_f32_e32 v30, 1.0, v51
 ; SI-NEXT:    v_mul_f32_e32 v6, 1.0, v41
 ; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v42
-; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v47
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v56
-; SI-NEXT:    v_mul_f32_e32 v29, 1.0, v45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; kill: killed $vgpr45
+; SI-NEXT:    v_mul_f32_e32 v27, 1.0, v45
+; SI-NEXT:    v_mul_f32_e32 v23, 1.0, v61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
 ; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
 ; SI-NEXT:    ; kill: killed $vgpr45
 ; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v47
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v56
 ; SI-NEXT:    v_mul_f32_e32 v47, 1.0, v57
 ; SI-NEXT:    v_mul_f32_e32 v60, 1.0, v60
 ; SI-NEXT:    v_mul_f32_e32 v57, 1.0, v62
 ; SI-NEXT:    v_mul_f32_e32 v56, 1.0, v63
-; SI-NEXT:    v_mul_f32_e32 v16, 1.0, v43
 ; SI-NEXT:    ; kill: killed $vgpr45
 ; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
 ; SI-NEXT:    ; implicit-def: $vgpr62
 ; SI-NEXT:    ; implicit-def: $vgpr63
 ; SI-NEXT:    ; implicit-def: $vgpr41
@@ -236469,96 +236457,106 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr38
 ; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr43
 ; SI-NEXT:    ; kill: killed $vgpr45
 ; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; kill: killed $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr61
 ; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    v_mul_f32_e32 v59, 1.0, v1
 ; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_mul_f32_e32 v58, 1.0, v3
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_mul_f32_e32 v27, 1.0, v22
+; SI-NEXT:    v_mul_f32_e32 v16, 1.0, v16
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_mul_f32_e32 v26, 1.0, v23
+; SI-NEXT:    v_mul_f32_e32 v24, 1.0, v20
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v24
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v21
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v28
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:124
-; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:128
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:132
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v22
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:124
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:128
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:132
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_mul_f32_e32 v23, 1.0, v30
+; SI-NEXT:    v_mul_f32_e32 v20, 1.0, v25
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_mul_f32_e32 v30, 1.0, v22
+; SI-NEXT:    v_mul_f32_e32 v26, 1.0, v21
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_mul_f32_e32 v22, 1.0, v24
+; SI-NEXT:    v_mul_f32_e32 v25, 1.0, v22
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; kill: killed $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; kill: killed $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; kill: killed $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; kill: killed $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; kill: killed $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; kill: killed $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; kill: killed $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; kill: killed $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; kill: killed $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; kill: killed $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; kill: killed $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; kill: killed $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; kill: killed $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr22
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v24, 1.0, v28
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; kill: killed $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; kill: killed $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; kill: killed $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; kill: killed $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; kill: killed $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; kill: killed $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; kill: killed $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; kill: killed $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; kill: killed $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; kill: killed $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; kill: killed $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; kill: killed $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; kill: killed $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; kill: killed $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; kill: killed $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr28
+; SI-NEXT:    v_mul_f32_e32 v21, 1.0, v34
+; SI-NEXT:    ; kill: killed $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; kill: killed $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr34
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB104_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v34
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v33
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v33
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v43
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v30
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
 ; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v20
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v30
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v26
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v25
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v21
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
@@ -236567,10 +236565,14 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v47
 ; SI-NEXT:    ; kill: killed $vgpr1
@@ -236582,7 +236584,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v61
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v29
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
@@ -236590,7 +236592,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v60
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v57
 ; SI-NEXT:    ; kill: killed $vgpr1
@@ -236602,277 +236604,265 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v29
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v23
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v27
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v59
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v58
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v27
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v15
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v25
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v28
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v16
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
 ; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v40
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v44
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v24
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v22
-; SI-NEXT:    ; implicit-def: $vgpr34
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
 ; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $vgpr43
 ; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr40
+; SI-NEXT:    ; implicit-def: $vgpr10
+; SI-NEXT:    ; implicit-def: $vgpr8
+; SI-NEXT:    ; implicit-def: $vgpr30
+; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr25
+; SI-NEXT:    ; implicit-def: $vgpr28
 ; SI-NEXT:    ; implicit-def: $vgpr44
 ; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr47
 ; SI-NEXT:    ; implicit-def: $vgpr46
-; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr29
 ; SI-NEXT:    ; implicit-def: $vgpr60
 ; SI-NEXT:    ; implicit-def: $vgpr57
 ; SI-NEXT:    ; implicit-def: $vgpr56
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr16
+; SI-NEXT:    ; implicit-def: $vgpr23
+; SI-NEXT:    ; implicit-def: $vgpr27
 ; SI-NEXT:    ; implicit-def: $vgpr59
 ; SI-NEXT:    ; implicit-def: $vgpr58
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr26
+; SI-NEXT:    ; implicit-def: $vgpr16
+; SI-NEXT:    ; implicit-def: $vgpr24
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT:    ; implicit-def: $vgpr20
+; SI-NEXT:    ; implicit-def: $vgpr26
+; SI-NEXT:    ; implicit-def: $vgpr25
+; SI-NEXT:    ; implicit-def: $vgpr21
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v28
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v22
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v12
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v55
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v40
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v54
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v32
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v14
-; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    ; implicit-def: $vgpr54
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v19
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr40
+; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v21
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v11
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr19
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v13
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v20
-; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v18
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v28
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v9
+; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $vgpr18
+; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v22
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v12
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v32
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v31
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v31
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v55
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
 ; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr55
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v19
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v17
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr12
+; SI-NEXT:    ; implicit-def: $vgpr17
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v22
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v54
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v18
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v53
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v28
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v53
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT:    ; implicit-def: $vgpr54
 ; SI-NEXT:    ; implicit-def: $vgpr53
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v17
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT:    ; implicit-def: $vgpr10
+; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v22
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v11
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v11
+; SI-NEXT:    ; implicit-def: $vgpr11
 ; SI-NEXT:  .LBB104_2: ; %Flow
 ; SI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB104_4
 ; SI-NEXT:  ; %bb.3: ; %cmp.true
-; SI-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
-; SI-NEXT:    v_and_b32_e32 v28, 0xffff0000, v34
+; SI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v33
+; SI-NEXT:    v_and_b32_e32 v33, 0xffff0000, v43
 ; SI-NEXT:    v_add_f32_e32 v33, 0x40c00000, v33
-; SI-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
+; SI-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
 ; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v33
-; SI-NEXT:    v_alignbit_b32 v28, v34, v28, 16
-; SI-NEXT:    v_and_b32_e32 v34, 0xffff0000, v54
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v22, v34, v22, 16
+; SI-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v28, 0xffff0000, v55
-; SI-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; SI-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v34
-; SI-NEXT:    v_alignbit_b32 v28, v35, v28, 16
-; SI-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v40
+; SI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; SI-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v32
+; SI-NEXT:    v_alignbit_b32 v22, v34, v22, 16
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v28, 0xffff0000, v32
+; SI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v31
+; SI-NEXT:    v_and_b32_e32 v31, 0xffff0000, v55
 ; SI-NEXT:    v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v31
-; SI-NEXT:    v_alignbit_b32 v28, v32, v28, 16
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v31
+; SI-NEXT:    v_alignbit_b32 v22, v34, v22, 16
+; SI-NEXT:    v_and_b32_e32 v34, 0xffff0000, v53
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v28, 0xffff0000, v53
+; SI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v54
+; SI-NEXT:    v_add_f32_e32 v51, 0x40c00000, v34
+; SI-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v51
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT:    v_alignbit_b32 v22, v34, v22, 16
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v14
+; SI-NEXT:    v_alignbit_b32 v15, v22, v15, 16
 ; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; SI-NEXT:    v_alignbit_b32 v28, v32, v28, 16
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v12
-; SI-NEXT:    v_alignbit_b32 v13, v28, v13, 16
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v19
+; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
+; SI-NEXT:    v_alignbit_b32 v15, v19, v15, 16
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v21
-; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v18
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
 ; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v10
-; SI-NEXT:    v_alignbit_b32 v13, v21, v13, 16
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v15
+; SI-NEXT:    v_alignbit_b32 v13, v18, v13, 16
 ; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v13
-; SI-NEXT:    v_alignbit_b32 v11, v20, v11, 16
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v19
-; SI-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v17
 ; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
-; SI-NEXT:    v_alignbit_b32 v11, v19, v11, 16
+; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v11
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v17
+; SI-NEXT:    v_alignbit_b32 v13, v17, v13, 16
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v9
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
 ; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; SI-NEXT:    v_alignbit_b32 v11, v17, v11, 16
+; SI-NEXT:    v_alignbit_b32 v10, v13, v10, 16
 ; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
 ; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT:    v_alignbit_b32 v8, v11, v8, 16
+; SI-NEXT:    v_alignbit_b32 v8, v10, v8, 16
 ; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; SI-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
 ; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
@@ -236883,256 +236873,254 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    v_alignbit_b32 v6, v8, v6, 16
 ; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; SI-NEXT:    v_alignbit_b32 v4, v6, v4, 16
 ; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v60
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v61
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v29
 ; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
 ; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
 ; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
 ; SI-NEXT:    v_alignbit_b32 v4, v8, v4, 16
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v16
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v27
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v29
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v23
 ; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
 ; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    v_alignbit_b32 v4, v11, v4, 16
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v26
+; SI-NEXT:    v_alignbit_b32 v4, v10, v4, 16
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v24
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
 ; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v10
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v19, v1, v3, 16
+; SI-NEXT:    v_alignbit_b32 v61, v1, v3, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v11
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v58
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v10
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v58
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v59
-; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
 ; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v27
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v20, v11, v3, 16
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v16
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v19, v10, v3, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v56
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v56
 ; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v8
 ; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v57
-; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT:    v_alignbit_b32 v4, v16, v4, 16
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v30
+; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT:    v_alignbit_b32 v4, v13, v4, 16
 ; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v23
-; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v21, v11, v8, 16
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v20
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v26
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v20, v10, v8, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v46
-; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v16
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v46
+; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
 ; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v47
-; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT:    v_alignbit_b32 v4, v17, v4, 16
+; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v13
 ; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v22
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v22, v11, v8, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_alignbit_b32 v4, v16, v4, 16
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v21
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v21, v10, v8, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v44
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v25
-; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v44
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v28
+; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
 ; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v23, v11, v8, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v23, v10, v8, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v40
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v15
-; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v30
 ; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff0000, v24
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v24, v11, v8, 16
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v25
+; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
 ; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT:    v_alignbit_b32 v45, v17, v4, 16
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v16
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT:    v_alignbit_b32 v18, v16, v4, 16
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v13
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; SI-NEXT:    v_alignbit_b32 v7, v24, v7, 16
 ; SI-NEXT:    v_alignbit_b32 v5, v23, v5, 16
-; SI-NEXT:    v_alignbit_b32 v2, v22, v2, 16
-; SI-NEXT:    v_alignbit_b32 v1, v19, v1, 16
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v2, v21, v2, 16
+; SI-NEXT:    v_alignbit_b32 v1, v61, v1, 16
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v25, v11, v8, 16
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v24, v10, v8, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v7, v24, v7, 16
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v15
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v9
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; SI-NEXT:    v_alignbit_b32 v8, v25, v8, 16
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v13
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
 ; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v16
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; SI-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
 ; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v17
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v11
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT:    v_alignbit_b32 v25, v45, v8, 16
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v9
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v8, v25, v8, 16
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v13
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v15
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
 ; SI-NEXT:    v_alignbit_b32 v62, v63, v16, 16
 ; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v33
 ; SI-NEXT:    v_alignbit_b32 v16, v62, v16, 16
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT:    v_alignbit_b32 v22, v34, v9, 16
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v11
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v9, v22, v9, 16
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT:    v_alignbit_b32 v35, v36, v11, 16
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v15
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT:    v_alignbit_b32 v37, v38, v13, 16
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v11
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT:    v_alignbit_b32 v37, v38, v11, 16
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v12
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v11, v37, v11, 16
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v15
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; SI-NEXT:    v_alignbit_b32 v10, v37, v10, 16
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v35, v11, 16
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT:    v_alignbit_b32 v39, v48, v12, 16
+; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT:    v_alignbit_b32 v35, v36, v10, 16
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v15
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v12, v39, v12, 16
+; SI-NEXT:    v_alignbit_b32 v10, v35, v10, 16
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT:    v_alignbit_b32 v39, v48, v13, 16
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
 ; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v15
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; SI-NEXT:    v_alignbit_b32 v12, v39, v12, 16
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
 ; SI-NEXT:    v_alignbit_b32 v49, v50, v13, 16
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT:    v_alignbit_b32 v28, v43, v9, 16
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v18
-; SI-NEXT:    v_alignbit_b32 v41, v42, v15, 16
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v34
-; SI-NEXT:    v_alignbit_b32 v15, v41, v15, 16
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v51
 ; SI-NEXT:    v_alignbit_b32 v13, v49, v13, 16
-; SI-NEXT:    v_alignbit_b32 v9, v28, v9, 16
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_alignbit_b32 v51, v52, v14, 16
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v31
 ; SI-NEXT:    v_alignbit_b32 v14, v51, v14, 16
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT:    v_alignbit_b32 v41, v42, v15, 16
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v32
+; SI-NEXT:    v_alignbit_b32 v15, v41, v15, 16
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_alignbit_b32 v2, v21, v6, 16
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_alignbit_b32 v2, v20, v3, 16
+; SI-NEXT:    v_alignbit_b32 v2, v20, v6, 16
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v45, v4, 16
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_alignbit_b32 v2, v19, v3, 16
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_alignbit_b32 v1, v18, v4, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; SI-NEXT:  .LBB104_4: ; %end
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -237147,7 +237135,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -237163,7 +237151,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -237179,7 +237167,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -237195,7 +237183,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -237211,7 +237199,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -237227,7 +237215,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -237243,7 +237231,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -237252,14 +237240,14 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v43
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v34
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -237269,17 +237257,15 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v45
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -237298,8 +237284,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -237318,8 +237304,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -237339,7 +237325,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -237358,8 +237344,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -237378,8 +237364,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x6c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -237387,11 +237373,9 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v61
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -237399,7 +237383,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -237407,10 +237391,12 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v45
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -239771,892 +239757,282 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:76
 ; SI-NEXT:    v_mul_f32_e32 v44, 1.0, v1
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v2
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v4
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v6
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v3
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v7
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v10
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v6
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v11
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v7
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v12
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT:    v_mul_f32_e32 v62, 1.0, v3
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e32 v61, 1.0, v4
 ; SI-NEXT:    v_mul_f32_e32 v45, 1.0, v5
 ; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v8
-; SI-NEXT:    v_mul_f32_e32 v46, 1.0, v9
-; SI-NEXT:    v_mul_f32_e32 v57, 1.0, v13
-; SI-NEXT:    v_mul_f32_e32 v10, 1.0, v14
-; SI-NEXT:    v_mul_f32_e32 v6, 1.0, v15
-; SI-NEXT:    v_mul_f32_e32 v61, 1.0, v16
-; SI-NEXT:    v_mul_f32_e32 v16, 1.0, v17
-; SI-NEXT:    v_mul_f32_e32 v14, 1.0, v18
-; SI-NEXT:    v_mul_f32_e32 v60, 1.0, v19
-; SI-NEXT:    v_mul_f32_e32 v56, 1.0, v20
-; SI-NEXT:    v_mul_f32_e32 v19, 1.0, v21
-; SI-NEXT:    v_mul_f32_e32 v18, 1.0, v22
-; SI-NEXT:    v_mul_f32_e32 v47, 1.0, v23
-; SI-NEXT:    v_mul_f32_e32 v63, 1.0, v24
-; SI-NEXT:    v_mul_f32_e32 v17, 1.0, v25
+; SI-NEXT:    v_mul_f32_e32 v8, 1.0, v9
+; SI-NEXT:    v_mul_f32_e32 v6, 1.0, v10
+; SI-NEXT:    v_mul_f32_e32 v62, 1.0, v11
+; SI-NEXT:    v_mul_f32_e32 v46, 1.0, v13
+; SI-NEXT:    v_mul_f32_e32 v13, 1.0, v14
+; SI-NEXT:    v_mul_f32_e32 v60, 1.0, v16
+; SI-NEXT:    v_mul_f32_e32 v57, 1.0, v17
+; SI-NEXT:    v_mul_f32_e32 v16, 1.0, v18
+; SI-NEXT:    v_mul_f32_e32 v56, 1.0, v19
+; SI-NEXT:    v_mul_f32_e32 v47, 1.0, v20
+; SI-NEXT:    v_mul_f32_e32 v17, 1.0, v21
+; SI-NEXT:    v_mul_f32_e32 v19, 1.0, v22
+; SI-NEXT:    v_mul_f32_e32 v18, 1.0, v23
+; SI-NEXT:    v_mul_f32_e32 v20, 1.0, v24
+; SI-NEXT:    v_mul_f32_e32 v21, 1.0, v25
 ; SI-NEXT:    v_mul_f32_e32 v22, 1.0, v26
-; SI-NEXT:    v_mul_f32_e32 v12, 1.0, v27
-; SI-NEXT:    v_mul_f32_e32 v13, 1.0, v28
+; SI-NEXT:    v_mul_f32_e32 v63, 1.0, v27
+; SI-NEXT:    v_mul_f32_e32 v58, 1.0, v28
 ; SI-NEXT:    v_mul_f32_e32 v26, 1.0, v29
 ; SI-NEXT:    v_mul_f32_e32 v23, 1.0, v30
-; SI-NEXT:    v_mul_f32_e64 v8, 1.0, s16
+; SI-NEXT:    v_mul_f32_e64 v7, 1.0, s16
 ; SI-NEXT:    v_mul_f32_e64 v3, 1.0, s18
 ; SI-NEXT:    v_mul_f32_e64 v9, 1.0, s22
-; SI-NEXT:    v_mul_f32_e64 v7, 1.0, s23
-; SI-NEXT:    v_mul_f32_e64 v29, 1.0, s24
-; SI-NEXT:    v_mul_f32_e64 v25, 1.0, s25
-; SI-NEXT:    v_mul_f32_e64 v20, 1.0, s26
-; SI-NEXT:    v_mul_f32_e64 v15, 1.0, s27
-; SI-NEXT:    v_mul_f32_e64 v24, 1.0, s28
-; SI-NEXT:    v_mul_f32_e64 v11, 1.0, s29
+; SI-NEXT:    v_mul_f32_e64 v11, 1.0, s23
+; SI-NEXT:    v_mul_f32_e64 v29, 1.0, s25
+; SI-NEXT:    v_mul_f32_e64 v14, 1.0, s27
+; SI-NEXT:    v_mul_f32_e64 v25, 1.0, s28
+; SI-NEXT:    v_mul_f32_e64 v24, 1.0, s29
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT:    v_mul_f32_e32 v58, 1.0, v32
-; SI-NEXT:    v_mul_f32_e32 v59, 1.0, v33
+; SI-NEXT:    v_mul_f32_e32 v31, 1.0, v15
+; SI-NEXT:    v_mul_f32_e32 v10, 1.0, v32
+; SI-NEXT:    v_mul_f32_e32 v12, 1.0, v33
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_mul_f32_e32 v21, 1.0, v34
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v36
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e32 v28, 1.0, v34
 ; SI-NEXT:    v_mul_f32_e32 v27, 1.0, v35
-; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v37
-; SI-NEXT:    v_mul_f32_e32 v30, 1.0, v38
-; SI-NEXT:    v_mul_f32_e32 v28, 1.0, v39
-; SI-NEXT:    v_mul_f32_e32 v37, 1.0, v48
-; SI-NEXT:    v_mul_f32_e32 v33, 1.0, v49
-; SI-NEXT:    v_mul_f32_e32 v36, 1.0, v50
-; SI-NEXT:    v_mul_f32_e32 v35, 1.0, v51
+; SI-NEXT:    v_mul_f32_e32 v59, 1.0, v37
+; SI-NEXT:    v_mul_f32_e32 v35, 1.0, v38
+; SI-NEXT:    v_mul_f32_e32 v30, 1.0, v39
+; SI-NEXT:    v_mul_f32_e32 v39, 1.0, v48
+; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v49
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_mul_f32_e32 v34, 1.0, v52
-; SI-NEXT:    s_waitcnt vmcnt(13)
-; SI-NEXT:    v_mul_f32_e32 v32, 1.0, v53
-; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_mul_f32_e32 v48, 1.0, v54
-; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_mul_f32_e32 v49, 1.0, v55
-; SI-NEXT:    s_waitcnt vmcnt(8) expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v42
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(8) expcnt(0)
+; SI-NEXT:    v_mul_f32_e32 v48, 1.0, v50
+; SI-NEXT:    s_waitcnt vmcnt(10) expcnt(0)
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v54
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e32 v36, 1.0, v51
+; SI-NEXT:    v_mul_f32_e32 v37, 1.0, v52
+; SI-NEXT:    v_mul_f32_e32 v38, 1.0, v53
+; SI-NEXT:    s_waitcnt vmcnt(6) expcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v43
-; SI-NEXT:    v_mul_f32_e32 v54, 1.0, v40
-; SI-NEXT:    v_mul_f32_e32 v53, 1.0, v41
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT:    v_mul_f32_e64 v39, 1.0, s17
+; SI-NEXT:    v_mul_f32_e32 v49, 1.0, v55
+; SI-NEXT:    v_mul_f32_e32 v33, 1.0, v40
+; SI-NEXT:    v_mul_f32_e32 v34, 1.0, v41
+; SI-NEXT:    v_mul_f32_e32 v32, 1.0, v42
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e64 v53, 1.0, s17
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s19
-; SI-NEXT:    v_mul_f32_e64 v38, 1.0, s20
-; SI-NEXT:    v_mul_f32_e64 v31, 1.0, s21
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e64 v52, 1.0, s20
+; SI-NEXT:    v_mul_f32_e64 v51, 1.0, s21
+; SI-NEXT:    v_mul_f32_e64 v50, 1.0, s24
+; SI-NEXT:    v_mul_f32_e64 v15, 1.0, s26
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
-; SI-NEXT:    s_cbranch_scc0 .LBB105_4
+; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT:    s_cbranch_scc0 .LBB105_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v31
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v52
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v29
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v51
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v25
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v50
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v62
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v29
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v24
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v15
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v44
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v11
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v53
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v45
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v14
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v39
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT:    v_mov_b32_e32 v11, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v24
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v8, v47
-; SI-NEXT:    v_mov_b32_e32 v39, v13
-; SI-NEXT:    v_mov_b32_e32 v31, v6
-; SI-NEXT:    v_mov_b32_e32 v29, v61
-; SI-NEXT:    v_mov_b32_e32 v24, v56
-; SI-NEXT:    v_mov_b32_e32 v38, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v44
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v57
+; SI-NEXT:    v_mov_b32_e32 v7, v5
+; SI-NEXT:    v_mov_b32_e32 v42, v62
+; SI-NEXT:    v_mov_b32_e32 v43, v63
+; SI-NEXT:    v_mov_b32_e32 v55, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v25
 ; SI-NEXT:    v_mov_b32_e32 v25, v60
+; SI-NEXT:    v_mov_b32_e32 v54, v47
+; SI-NEXT:    v_mov_b32_e32 v40, v20
+; SI-NEXT:    v_mov_b32_e32 v51, v61
+; SI-NEXT:    s_mov_b64 s[4:5], 0
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v46
+; SI-NEXT:    v_mov_b32_e32 v29, v31
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v31
+; SI-NEXT:    v_mov_b32_e32 v24, v56
 ; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT:    v_mov_b32_e32 v55, v12
-; SI-NEXT:    v_mov_b32_e32 v40, v58
-; SI-NEXT:    v_mov_b32_e32 v52, v59
-; SI-NEXT:    v_mov_b32_e32 v43, v37
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v37
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v48
+; SI-NEXT:    v_mov_b32_e32 v52, v10
+; SI-NEXT:    v_mov_b32_e32 v53, v59
+; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
 ; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v49
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_mov_b32_e32 v62, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v61
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v46
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v45
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v26
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v36
+; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v13
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v57
+; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v4
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v42, v1
+; SI-NEXT:    v_mov_b32_e32 v50, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v41, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v14
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v60
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v47
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v63
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v13
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v61
-; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v56
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v23
-; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v21
-; SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v54
-; SI-NEXT:    v_mov_b32_e32 v54, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v53
-; SI-NEXT:    v_mov_b32_e32 v53, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v58
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v59
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v27
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v35
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v34
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v13
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v62
+; SI-NEXT:    v_mov_b32_e32 v62, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v63
+; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v12
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v33
-; SI-NEXT:    v_mov_b32_e32 v33, v34
-; SI-NEXT:    v_mov_b32_e32 v34, v13
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v4
-; SI-NEXT:    v_mov_b32_e32 v13, v4
-; SI-NEXT:    s_cbranch_execnz .LBB105_3
-; SI-NEXT:  .LBB105_2: ; %cmp.true
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff0000, v52
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v2
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v19
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT:    v_add_f32_e32 v5, 0x40c00000, v2
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v27
+; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v59
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v36
+; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v32
+; SI-NEXT:    s_waitcnt vmcnt(5) expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v12
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v39
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_and_b32_e32 v27, 0xffff0000, v7
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v2
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-NEXT:    v_alignbit_b32 v1, v3, v1, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v62
-; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; SI-NEXT:    v_and_b32_e32 v37, 0xffff0000, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_alignbit_b32 v1, v9, v1, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v11
-; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_alignbit_b32 v1, v11, v1, 16
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v41
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v41, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v60
+; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v47
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v20
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v33
+; SI-NEXT:    v_mov_b32_e32 v33, v34
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v42
-; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v11
-; SI-NEXT:    v_alignbit_b32 v1, v14, v1, 16
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v29
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v34
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v31
-; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v14
-; SI-NEXT:    v_alignbit_b32 v1, v16, v1, 16
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v56
+; SI-NEXT:    v_mov_b32_e32 v39, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v37
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v21
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v25
-; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v16
-; SI-NEXT:    v_alignbit_b32 v1, v18, v1, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_alignbit_b32 v53, v20, v1, 16
-; SI-NEXT:    v_and_b32_e32 v20, 0xffff0000, v39
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v55
-; SI-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v20
-; SI-NEXT:    v_alignbit_b32 v54, v22, v1, 16
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v40
-; SI-NEXT:    v_add_f32_e32 v24, 0x40c00000, v23
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v24
-; SI-NEXT:    v_alignbit_b32 v1, v23, v1, 16
-; SI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v38
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v34
-; SI-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v25
-; SI-NEXT:    v_alignbit_b32 v1, v26, v1, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v43
-; SI-NEXT:    v_add_f32_e32 v29, 0x40c00000, v27
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v29
-; SI-NEXT:    v_alignbit_b32 v17, v27, v1, 16
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v33
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_and_b32_e32 v33, 0xffff0000, v13
-; SI-NEXT:    v_add_f32_e32 v33, 0x40c00000, v33
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v33
-; SI-NEXT:    v_and_b32_e32 v21, 0xffff0000, v25
-; SI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v7
-; SI-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v25
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v24
-; SI-NEXT:    v_and_b32_e32 v34, 0xffff0000, v9
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v38, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_and_b32_e32 v28, 0xffff0000, v8
-; SI-NEXT:    v_add_f32_e32 v31, 0x40c00000, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v31
-; SI-NEXT:    v_alignbit_b32 v28, v28, v1, 16
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_and_b32_e32 v24, 0xffff0000, v7
-; SI-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v24
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v58
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v28
+; SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v35
+; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v48
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v37, v38
+; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v38
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v8
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v30, 0xffff0000, v8
-; SI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v30
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v32
-; SI-NEXT:    v_alignbit_b32 v30, v30, v1, 16
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v10
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_alignbit_b32 v36, v35, v1, 16
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v52, 0xffff0000, v32
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v32, 0xffff0000, v8
-; SI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v32
-; SI-NEXT:    v_alignbit_b32 v48, v49, v1, 16
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v31
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v31, 0xffff0000, v10
-; SI-NEXT:    v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v31
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v29
-; SI-NEXT:    v_and_b32_e32 v29, 0xffff0000, v12
-; SI-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v29
-; SI-NEXT:    v_and_b32_e32 v32, 0xffff0000, v20
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_alignbit_b32 v26, v58, v1, 16
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_alignbit_b32 v23, v27, v1, 16
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; SI-NEXT:    v_alignbit_b32 v21, v23, v21, 16
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_alignbit_b32 v56, v59, v1, 16
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_alignbit_b32 v12, v22, v1, 16
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    v_add_f32_e32 v20, 0x40c00000, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_alignbit_b32 v63, v1, v20, 16
-; SI-NEXT:    v_and_b32_e32 v20, 0xffff0000, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT:    v_alignbit_b32 v47, v63, v19, 16
-; SI-NEXT:    v_alignbit_b32 v19, v56, v13, 16
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v24, 0xffff0000, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v24
-; SI-NEXT:    v_alignbit_b32 v60, v18, v20, 16
-; SI-NEXT:    v_alignbit_b32 v16, v60, v16, 16
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v29, 0xffff0000, v7
-; SI-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v29
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v61, v7, v25, 16
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v31, 0xffff0000, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v33, 0xffff0000, v7
-; SI-NEXT:    v_add_f32_e32 v33, 0x40c00000, v33
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v33
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v57, v7, v31, 16
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v33, 0xffff0000, v11
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v31, 0xffff0000, v7
-; SI-NEXT:    v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v31
-; SI-NEXT:    v_alignbit_b32 v24, v46, v11, 16
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v11
-; SI-NEXT:    v_alignbit_b32 v25, v45, v9, 16
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v9
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT:    v_alignbit_b32 v44, v15, v3, 16
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v39, 0xffff0000, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT:    v_add_f32_e32 v39, 0x40c00000, v39
-; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v11
-; SI-NEXT:    v_alignbit_b32 v9, v3, v9, 16
-; SI-NEXT:    v_alignbit_b32 v2, v20, v2, 16
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v50, 0xffff0000, v7
-; SI-NEXT:    v_add_f32_e32 v50, 0x40c00000, v50
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v50
-; SI-NEXT:    v_alignbit_b32 v50, v51, v39, 16
-; SI-NEXT:    v_alignbit_b32 v4, v50, v4, 16
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v4, v9, v5, 16
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v4, v2, v6, 16
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v4, v44, v38, 16
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v4, v25, v37, 16
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v4, v24, v34, 16
-; SI-NEXT:    v_mov_b32_e32 v7, v25
-; SI-NEXT:    v_mov_b32_e32 v37, v17
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v4, v57, v33, 16
-; SI-NEXT:    v_alignbit_b32 v6, v61, v14, 16
-; SI-NEXT:    v_alignbit_b32 v17, v12, v32, 16
-; SI-NEXT:    v_alignbit_b32 v14, v26, v10, 16
-; SI-NEXT:    v_alignbit_b32 v32, v48, v8, 16
-; SI-NEXT:    v_alignbit_b32 v5, v36, v52, 16
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT:  .LBB105_3: ; %end
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
-; SI-NEXT:    v_or_b32_e32 v8, v8, v10
-; SI-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v50
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v51
-; SI-NEXT:    v_or_b32_e32 v8, v8, v10
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v8, v10, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_or_b32_e32 v4, v8, v4
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v4, v8, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
-; SI-NEXT:    v_or_b32_e32 v4, v4, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v4, v8, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v20
-; SI-NEXT:    v_or_b32_e32 v4, v4, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 16, v0
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v4, v8, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v44
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v15
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v45
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 36, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v46
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 44, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v57
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 56, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v61
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v16
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 64, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v60
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v18
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x44, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v53
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v47
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x48, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v63
-; SI-NEXT:    v_or_b32_e32 v1, v2, v1
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v54
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v17
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x50, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v22
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v19
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v56
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v59
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v21
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v23
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v27
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v14
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v58
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x6c, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v32
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v48
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v49
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x74, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v30
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v36
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v35
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    s_setpc_b64 s[30:31]
-; SI-NEXT:  .LBB105_4:
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_mov_b32_e32 v39, v13
+; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v34
+; SI-NEXT:    s_branch .LBB105_3
+; SI-NEXT:  .LBB105_2:
+; SI-NEXT:    s_waitcnt expcnt(4)
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v55, v12
 ; SI-NEXT:    v_mov_b32_e32 v33, v34
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
@@ -240688,73 +240064,695 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v7, v5
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    v_mov_b32_e32 v11, v5
-; SI-NEXT:    v_mov_b32_e32 v31, v6
-; SI-NEXT:    v_mov_b32_e32 v29, v61
+; SI-NEXT:    ; kill: killed $vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    v_mov_b32_e32 v51, v61
+; SI-NEXT:    v_mov_b32_e32 v42, v62
+; SI-NEXT:    v_mov_b32_e32 v29, v31
 ; SI-NEXT:    v_mov_b32_e32 v25, v60
 ; SI-NEXT:    v_mov_b32_e32 v24, v56
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v8, v47
-; SI-NEXT:    v_mov_b32_e32 v55, v12
-; SI-NEXT:    v_mov_b32_e32 v40, v58
-; SI-NEXT:    v_mov_b32_e32 v52, v59
-; SI-NEXT:    v_mov_b32_e32 v38, v4
-; SI-NEXT:    v_mov_b32_e32 v43, v37
+; SI-NEXT:    v_mov_b32_e32 v54, v47
+; SI-NEXT:    v_mov_b32_e32 v40, v20
+; SI-NEXT:    v_mov_b32_e32 v43, v63
+; SI-NEXT:    v_mov_b32_e32 v52, v10
+; SI-NEXT:    v_mov_b32_e32 v53, v59
+; SI-NEXT:    v_mov_b32_e32 v39, v4
+; SI-NEXT:    v_mov_b32_e32 v37, v38
+; SI-NEXT:    s_mov_b64 s[4:5], -1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; kill: killed $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr50
-; SI-NEXT:    ; implicit-def: $vgpr51
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $vgpr11
 ; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr44
 ; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $vgpr44
 ; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr46
-; SI-NEXT:    ; implicit-def: $vgpr57
+; SI-NEXT:    ; implicit-def: $vgpr62
 ; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    ; implicit-def: $vgpr46
 ; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr14
+; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $vgpr60
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr60
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr53
+; SI-NEXT:    ; implicit-def: $vgpr56
 ; SI-NEXT:    ; implicit-def: $vgpr47
-; SI-NEXT:    ; implicit-def: $vgpr63
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr54
+; SI-NEXT:    ; implicit-def: $vgpr18
+; SI-NEXT:    ; implicit-def: $vgpr19
 ; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr12
+; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr56
-; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; kill: killed $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr21
+; SI-NEXT:    ; implicit-def: $vgpr57
+; SI-NEXT:    ; implicit-def: $vgpr58
+; SI-NEXT:    ; implicit-def: $vgpr63
 ; SI-NEXT:    ; implicit-def: $vgpr23
+; SI-NEXT:    ; implicit-def: $vgpr10
+; SI-NEXT:    ; kill: killed $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr37
-; SI-NEXT:    ; implicit-def: $vgpr14
+; SI-NEXT:    ; implicit-def: $vgpr21
+; SI-NEXT:    ; implicit-def: $vgpr30
+; SI-NEXT:    ; kill: killed $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr58
 ; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr59
+; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; kill: killed $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr38
 ; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; implicit-def: $vgpr30
+; SI-NEXT:    ; implicit-def: $vgpr20
 ; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr36
 ; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    s_branch .LBB105_2
+; SI-NEXT:  .LBB105_3: ; %Flow
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT:    s_cbranch_vccnz .LBB105_5
+; SI-NEXT:  ; %bb.4: ; %cmp.true
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v40
+; SI-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v19
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff0000, v55
+; SI-NEXT:    v_and_b32_e32 v27, 0xffff0000, v39
+; SI-NEXT:    v_and_b32_e32 v28, 0xffff0000, v37
+; SI-NEXT:    v_and_b32_e32 v30, 0xffff0000, v33
+; SI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v30
+; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v32
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v33, 0xffff0000, v34
+; SI-NEXT:    v_add_f32_e32 v33, 0x40c00000, v33
+; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v33
+; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v2
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
+; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT:    v_add_f32_e32 v5, 0x40c00000, v2
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
+; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v2
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
+; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT:    v_alignbit_b32 v1, v3, v1, 16
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v51
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v31
+; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; SI-NEXT:    v_alignbit_b32 v1, v9, v1, 16
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v7
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v50
+; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
+; SI-NEXT:    v_alignbit_b32 v1, v11, v1, 16
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v41
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v42
+; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v11
+; SI-NEXT:    v_alignbit_b32 v1, v14, v1, 16
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v25
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v29
+; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v14
+; SI-NEXT:    v_alignbit_b32 v51, v16, v1, 16
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v54
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v24
+; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v16
+; SI-NEXT:    v_alignbit_b32 v1, v18, v1, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v24, 0x40c00000, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v24
+; SI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v53
+; SI-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v25
+; SI-NEXT:    v_add_f32_e32 v29, 0x40c00000, v27
+; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v29
+; SI-NEXT:    v_add_f32_e32 v31, 0x40c00000, v28
+; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v31
+; SI-NEXT:    v_and_b32_e32 v38, 0xffff0000, v2
+; SI-NEXT:    v_and_b32_e32 v34, 0xffff0000, v9
+; SI-NEXT:    v_and_b32_e32 v37, 0xffff0000, v3
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_alignbit_b32 v18, v20, v1, 16
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_and_b32_e32 v20, 0xffff0000, v7
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v43
+; SI-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v20
+; SI-NEXT:    v_alignbit_b32 v1, v22, v1, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v52
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_alignbit_b32 v1, v23, v1, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v12
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_alignbit_b32 v1, v26, v1, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v32
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v26, 0xffff0000, v29
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; SI-NEXT:    v_add_f32_e32 v33, 0x40c00000, v33
+; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v33
+; SI-NEXT:    v_and_b32_e32 v33, 0xffff0000, v11
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_alignbit_b32 v1, v27, v1, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v29, 0xffff0000, v12
+; SI-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
+; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v25
+; SI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v13
+; SI-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT:    v_and_b32_e32 v27, 0xffff0000, v24
+; SI-NEXT:    v_and_b32_e32 v24, 0xffff0000, v15
+; SI-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v24
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_alignbit_b32 v1, v28, v1, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_alignbit_b32 v52, v30, v1, 16
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v29
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_alignbit_b32 v36, v35, v1, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v8
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v32, 0xffff0000, v8
+; SI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v32
+; SI-NEXT:    v_alignbit_b32 v48, v49, v1, 16
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v31
+; SI-NEXT:    v_and_b32_e32 v31, 0xffff0000, v10
+; SI-NEXT:    v_add_f32_e32 v31, 0x40c00000, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v25
+; SI-NEXT:    v_and_b32_e32 v32, 0xffff0000, v20
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_alignbit_b32 v28, v59, v1, 16
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v26, v28, v26, 16
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; SI-NEXT:    v_add_f32_e32 v31, 0x40c00000, v31
+; SI-NEXT:    v_alignbit_b32 v46, v61, v31, 16
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_alignbit_b32 v21, v30, v1, 16
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; SI-NEXT:    v_add_f32_e32 v31, 0x40c00000, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v31
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_alignbit_b32 v23, v10, v1, 16
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v63, v23, v27, 16
+; SI-NEXT:    v_alignbit_b32 v27, v21, v12, 16
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_alignbit_b32 v57, v58, v1, 16
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_add_f32_e32 v20, 0x40c00000, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_alignbit_b32 v17, v1, v20, 16
+; SI-NEXT:    v_and_b32_e32 v20, 0xffff0000, v15
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; SI-NEXT:    v_alignbit_b32 v19, v17, v19, 16
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v24, 0xffff0000, v15
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v24
+; SI-NEXT:    v_alignbit_b32 v56, v47, v20, 16
+; SI-NEXT:    v_alignbit_b32 v20, v62, v11, 16
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v16, v56, v16, 16
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v15
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v11
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v22, v45, v9, 16
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v29, 0xffff0000, v15
+; SI-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
+; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v29
+; SI-NEXT:    v_alignbit_b32 v13, v60, v25, 16
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v9
+; SI-NEXT:    v_alignbit_b32 v24, v44, v3, 16
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v39, 0xffff0000, v15
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v39, 0x40c00000, v39
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_alignbit_b32 v9, v11, v9, 16
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v50, 0xffff0000, v15
+; SI-NEXT:    v_mov_b32_e32 v15, v24
+; SI-NEXT:    v_add_f32_e32 v50, 0x40c00000, v50
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v50
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v3, v3, v39, 16
+; SI-NEXT:    v_alignbit_b32 v4, v3, v4, 16
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v4, v9, v5, 16
+; SI-NEXT:    v_alignbit_b32 v5, v36, v7, 16
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v4, v2, v6, 16
+; SI-NEXT:    v_alignbit_b32 v6, v46, v33, 16
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v4, v24, v38, 16
+; SI-NEXT:    v_alignbit_b32 v38, v48, v8, 16
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_alignbit_b32 v4, v22, v37, 16
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v22, v57, v32, 16
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_alignbit_b32 v4, v20, v34, 16
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v20, v52
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v4, v13, v14, 16
+; SI-NEXT:    v_mov_b32_e32 v14, v51
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:  .LBB105_5: ; %end
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v4
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; SI-NEXT:    v_or_b32_e32 v7, v7, v8
+; SI-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; SI-NEXT:    v_or_b32_e32 v7, v7, v8
+; SI-NEXT:    v_add_i32_e32 v8, vcc, 4, v0
+; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v4
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; SI-NEXT:    v_or_b32_e32 v4, v7, v4
+; SI-NEXT:    v_add_i32_e32 v7, vcc, 8, v0
+; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v11
+; SI-NEXT:    v_or_b32_e32 v4, v4, v7
+; SI-NEXT:    v_add_i32_e32 v7, vcc, 12, v0
+; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; SI-NEXT:    v_or_b32_e32 v4, v4, v7
+; SI-NEXT:    v_add_i32_e32 v7, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
+; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 24, v0
+; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v44
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
+; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
+; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v45
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 36, v0
+; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 40, v0
+; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v62
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 44, v0
+; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
+; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v46
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v61
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
+; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v14
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 56, v0
+; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v13
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v60
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
+; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v16
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 64, v0
+; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v56
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v47
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x44, v0
+; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v19
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x48, v0
+; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v17
+; SI-NEXT:    v_or_b32_e32 v1, v2, v1
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v22
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x50, v0
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v57
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v58
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v63
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v0
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v23
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v10
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v27
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v0
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v30
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v26
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v59
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x6c, v0
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v38
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v49
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x74, v0
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v20
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v36
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v35
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v64bf16_to_v64i16_scalar:
 ; VI:       ; %bb.0:
@@ -243836,50 +243834,50 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; SI-NEXT:    s_mov_b32 s60, s16
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v41, s17, 0
+; SI-NEXT:    s_mov_b32 s61, s19
 ; SI-NEXT:    v_writelane_b32 v41, s60, 1
-; SI-NEXT:    s_mov_b32 s61, s18
-; SI-NEXT:    v_writelane_b32 v41, s19, 2
+; SI-NEXT:    s_mov_b32 s63, s18
+; SI-NEXT:    v_writelane_b32 v41, s61, 2
 ; SI-NEXT:    s_mov_b32 s72, s21
-; SI-NEXT:    v_writelane_b32 v41, s61, 3
+; SI-NEXT:    v_writelane_b32 v41, s63, 3
 ; SI-NEXT:    v_writelane_b32 v41, s72, 4
 ; SI-NEXT:    s_mov_b32 s74, s23
 ; SI-NEXT:    v_writelane_b32 v41, s20, 5
 ; SI-NEXT:    v_writelane_b32 v41, s74, 6
-; SI-NEXT:    s_mov_b32 s76, s25
+; SI-NEXT:    s_mov_b32 s75, s25
 ; SI-NEXT:    v_writelane_b32 v41, s22, 7
-; SI-NEXT:    v_writelane_b32 v41, s76, 8
-; SI-NEXT:    s_mov_b32 s78, s27
+; SI-NEXT:    v_writelane_b32 v41, s75, 8
+; SI-NEXT:    s_mov_b32 s76, s27
 ; SI-NEXT:    v_writelane_b32 v41, s24, 9
-; SI-NEXT:    v_writelane_b32 v41, s78, 10
-; SI-NEXT:    s_mov_b32 s79, s29
+; SI-NEXT:    v_writelane_b32 v41, s76, 10
+; SI-NEXT:    s_mov_b32 s93, s29
 ; SI-NEXT:    v_writelane_b32 v41, s26, 11
-; SI-NEXT:    v_writelane_b32 v41, s79, 12
-; SI-NEXT:    v_readfirstlane_b32 s6, v2
+; SI-NEXT:    v_writelane_b32 v41, s93, 12
+; SI-NEXT:    v_readfirstlane_b32 s16, v2
 ; SI-NEXT:    v_writelane_b32 v41, s28, 13
 ; SI-NEXT:    v_readfirstlane_b32 s73, v4
-; SI-NEXT:    v_writelane_b32 v41, s6, 14
-; SI-NEXT:    v_readfirstlane_b32 s95, v3
+; SI-NEXT:    v_writelane_b32 v41, s16, 14
+; SI-NEXT:    v_readfirstlane_b32 s89, v3
 ; SI-NEXT:    v_writelane_b32 v41, s73, 15
-; SI-NEXT:    v_readfirstlane_b32 s36, v6
-; SI-NEXT:    v_writelane_b32 v41, s95, 16
-; SI-NEXT:    v_readfirstlane_b32 s30, v5
-; SI-NEXT:    v_writelane_b32 v41, s36, 17
-; SI-NEXT:    v_readfirstlane_b32 s37, v8
-; SI-NEXT:    v_writelane_b32 v41, s30, 18
-; SI-NEXT:    v_readfirstlane_b32 s38, v7
-; SI-NEXT:    v_writelane_b32 v41, s37, 19
-; SI-NEXT:    v_readfirstlane_b32 s39, v10
-; SI-NEXT:    v_writelane_b32 v41, s38, 20
+; SI-NEXT:    v_readfirstlane_b32 s90, v6
+; SI-NEXT:    v_writelane_b32 v41, s89, 16
+; SI-NEXT:    v_readfirstlane_b32 s91, v5
+; SI-NEXT:    v_writelane_b32 v41, s90, 17
+; SI-NEXT:    v_readfirstlane_b32 s34, v8
+; SI-NEXT:    v_writelane_b32 v41, s91, 18
+; SI-NEXT:    v_readfirstlane_b32 s35, v7
+; SI-NEXT:    v_writelane_b32 v41, s34, 19
+; SI-NEXT:    v_readfirstlane_b32 s36, v10
+; SI-NEXT:    v_writelane_b32 v41, s35, 20
 ; SI-NEXT:    v_writelane_b32 v40, s96, 32
-; SI-NEXT:    v_readfirstlane_b32 s48, v9
-; SI-NEXT:    v_writelane_b32 v41, s39, 21
-; SI-NEXT:    v_writelane_b32 v40, s97, 33
+; SI-NEXT:    v_readfirstlane_b32 s37, v9
+; SI-NEXT:    v_writelane_b32 v41, s36, 21
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_readfirstlane_b32 s92, v31
+; SI-NEXT:    v_readfirstlane_b32 s62, v31
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_readfirstlane_b32 s93, v32
+; SI-NEXT:    v_readfirstlane_b32 s80, v32
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_readfirstlane_b32 s90, v33
+; SI-NEXT:    v_readfirstlane_b32 s69, v33
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:44
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:36
@@ -243891,19 +243889,20 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_readfirstlane_b32 s84, v34
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_readfirstlane_b32 s94, v35
+; SI-NEXT:    v_readfirstlane_b32 s68, v35
 ; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_readfirstlane_b32 s83, v36
 ; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_readfirstlane_b32 s87, v38
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:80
-; SI-NEXT:    v_readfirstlane_b32 s91, v37
+; SI-NEXT:    v_readfirstlane_b32 s6, v37
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32
-; SI-NEXT:    v_readfirstlane_b32 s49, v12
-; SI-NEXT:    v_writelane_b32 v41, s48, 22
+; SI-NEXT:    v_writelane_b32 v40, s97, 33
+; SI-NEXT:    v_readfirstlane_b32 s38, v12
+; SI-NEXT:    v_writelane_b32 v41, s37, 22
 ; SI-NEXT:    v_writelane_b32 v40, s98, 34
 ; SI-NEXT:    v_readfirstlane_b32 s14, v30
 ; SI-NEXT:    v_readfirstlane_b32 s15, v29
@@ -243913,21 +243912,21 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; SI-NEXT:    v_readfirstlane_b32 s11, v25
 ; SI-NEXT:    v_readfirstlane_b32 s8, v24
 ; SI-NEXT:    v_readfirstlane_b32 s9, v23
-; SI-NEXT:    v_readfirstlane_b32 s89, v22
-; SI-NEXT:    v_readfirstlane_b32 s7, v21
-; SI-NEXT:    v_readfirstlane_b32 s88, v20
-; SI-NEXT:    v_readfirstlane_b32 s29, v19
-; SI-NEXT:    v_readfirstlane_b32 s77, v18
-; SI-NEXT:    v_readfirstlane_b32 s27, v17
-; SI-NEXT:    v_readfirstlane_b32 s75, v16
-; SI-NEXT:    v_readfirstlane_b32 s25, v15
-; SI-NEXT:    v_readfirstlane_b32 s50, v14
-; SI-NEXT:    v_readfirstlane_b32 s23, v13
-; SI-NEXT:    v_readfirstlane_b32 s21, v11
+; SI-NEXT:    v_readfirstlane_b32 s88, v22
+; SI-NEXT:    v_readfirstlane_b32 s29, v21
+; SI-NEXT:    v_readfirstlane_b32 s79, v20
+; SI-NEXT:    v_readfirstlane_b32 s27, v19
+; SI-NEXT:    v_readfirstlane_b32 s78, v18
+; SI-NEXT:    v_readfirstlane_b32 s25, v17
+; SI-NEXT:    v_readfirstlane_b32 s77, v16
+; SI-NEXT:    v_readfirstlane_b32 s23, v15
+; SI-NEXT:    v_readfirstlane_b32 s39, v14
+; SI-NEXT:    v_readfirstlane_b32 s21, v13
+; SI-NEXT:    v_readfirstlane_b32 s19, v11
 ; SI-NEXT:    v_readfirstlane_b32 s18, v1
-; SI-NEXT:    v_writelane_b32 v41, s49, 23
+; SI-NEXT:    v_writelane_b32 v41, s38, 23
 ; SI-NEXT:    v_writelane_b32 v40, s99, 35
-; SI-NEXT:    v_writelane_b32 v41, s50, 24
+; SI-NEXT:    v_writelane_b32 v41, s39, 24
 ; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_readfirstlane_b32 s58, v31
 ; SI-NEXT:    s_waitcnt vmcnt(11)
@@ -243956,166 +243955,165 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB107_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    s_lshl_b32 s5, s17, 16
-; SI-NEXT:    v_writelane_b32 v41, s5, 25
-; SI-NEXT:    s_lshl_b32 s5, s61, 16
-; SI-NEXT:    v_writelane_b32 v41, s5, 26
-; SI-NEXT:    s_lshl_b32 s5, s20, 16
-; SI-NEXT:    v_writelane_b32 v41, s5, 27
-; SI-NEXT:    s_lshl_b32 s5, s22, 16
-; SI-NEXT:    v_writelane_b32 v41, s5, 28
-; SI-NEXT:    s_lshl_b32 s5, s24, 16
-; SI-NEXT:    v_writelane_b32 v41, s5, 29
-; SI-NEXT:    s_lshl_b32 s5, s26, 16
-; SI-NEXT:    v_writelane_b32 v41, s5, 30
-; SI-NEXT:    s_lshl_b32 s5, s28, 16
-; SI-NEXT:    v_writelane_b32 v41, s5, 31
-; SI-NEXT:    s_lshl_b32 s5, s18, 16
-; SI-NEXT:    v_writelane_b32 v41, s5, 32
-; SI-NEXT:    s_lshl_b32 s5, s95, 16
-; SI-NEXT:    v_writelane_b32 v41, s5, 33
-; SI-NEXT:    s_lshl_b32 s5, s38, 16
 ; SI-NEXT:    s_lshl_b32 s4, s60, 16
-; SI-NEXT:    v_writelane_b32 v41, s5, 34
-; SI-NEXT:    s_lshl_b32 s5, s48, 16
-; SI-NEXT:    s_lshl_b32 s16, s19, 16
-; SI-NEXT:    s_lshl_b32 s63, s72, 16
-; SI-NEXT:    s_lshl_b32 s98, s74, 16
-; SI-NEXT:    s_lshl_b32 s62, s76, 16
-; SI-NEXT:    s_lshl_b32 s96, s78, 16
-; SI-NEXT:    s_lshl_b32 s31, s79, 16
-; SI-NEXT:    s_lshl_b32 s34, s6, 16
-; SI-NEXT:    s_lshl_b32 s35, s73, 16
-; SI-NEXT:    s_lshl_b32 s30, s30, 16
-; SI-NEXT:    s_lshl_b32 s36, s36, 16
-; SI-NEXT:    s_lshl_b32 s99, s37, 16
-; SI-NEXT:    v_writelane_b32 v41, s5, 35
+; SI-NEXT:    v_writelane_b32 v41, s4, 25
+; SI-NEXT:    s_lshl_b32 s4, s63, 16
+; SI-NEXT:    v_writelane_b32 v41, s4, 26
+; SI-NEXT:    s_lshl_b32 s4, s20, 16
+; SI-NEXT:    v_writelane_b32 v41, s4, 27
+; SI-NEXT:    s_lshl_b32 s4, s22, 16
+; SI-NEXT:    v_writelane_b32 v41, s4, 28
+; SI-NEXT:    s_lshl_b32 s4, s24, 16
+; SI-NEXT:    v_writelane_b32 v41, s4, 29
+; SI-NEXT:    s_lshl_b32 s4, s26, 16
+; SI-NEXT:    v_writelane_b32 v41, s4, 30
+; SI-NEXT:    s_lshl_b32 s4, s28, 16
+; SI-NEXT:    v_writelane_b32 v41, s4, 31
+; SI-NEXT:    s_lshl_b32 s4, s18, 16
+; SI-NEXT:    v_writelane_b32 v41, s4, 32
+; SI-NEXT:    s_lshl_b32 s4, s89, 16
+; SI-NEXT:    v_writelane_b32 v41, s4, 33
+; SI-NEXT:    s_lshl_b32 s4, s91, 16
+; SI-NEXT:    v_writelane_b32 v41, s4, 34
+; SI-NEXT:    s_lshl_b32 s4, s35, 16
+; SI-NEXT:    v_writelane_b32 v41, s4, 35
+; SI-NEXT:    s_lshl_b32 s4, s37, 16
+; SI-NEXT:    s_lshl_b32 s7, s17, 16
+; SI-NEXT:    s_lshl_b32 s96, s61, 16
+; SI-NEXT:    s_lshl_b32 s99, s72, 16
+; SI-NEXT:    s_lshl_b32 s97, s74, 16
+; SI-NEXT:    s_lshl_b32 s92, s75, 16
+; SI-NEXT:    s_lshl_b32 s94, s76, 16
+; SI-NEXT:    s_lshl_b32 s95, s93, 16
+; SI-NEXT:    s_lshl_b32 s93, s16, 16
+; SI-NEXT:    s_lshl_b32 s30, s73, 16
+; SI-NEXT:    s_lshl_b32 s31, s90, 16
+; SI-NEXT:    s_lshl_b32 s34, s34, 16
+; SI-NEXT:    v_writelane_b32 v41, s4, 36
+; SI-NEXT:    s_lshl_b32 s35, s36, 16
+; SI-NEXT:    s_lshl_b32 s86, s19, 16
+; SI-NEXT:    s_lshl_b32 s36, s38, 16
+; SI-NEXT:    s_lshl_b32 s22, s21, 16
 ; SI-NEXT:    s_lshl_b32 s37, s39, 16
-; SI-NEXT:    s_lshl_b32 s19, s21, 16
-; SI-NEXT:    s_lshl_b32 s38, s49, 16
-; SI-NEXT:    s_lshl_b32 s20, s23, 16
-; SI-NEXT:    s_lshl_b32 s39, s50, 16
-; SI-NEXT:    s_lshl_b32 s22, s25, 16
-; SI-NEXT:    s_lshl_b32 s48, s75, 16
-; SI-NEXT:    s_lshl_b32 s60, s27, 16
-; SI-NEXT:    s_lshl_b32 s49, s77, 16
-; SI-NEXT:    s_lshl_b32 s24, s29, 16
-; SI-NEXT:    s_lshl_b32 s50, s88, 16
-; SI-NEXT:    s_lshl_b32 s61, s7, 16
-; SI-NEXT:    s_lshl_b32 s51, s89, 16
-; SI-NEXT:    s_lshl_b32 s28, s9, 16
-; SI-NEXT:    s_lshl_b32 s52, s8, 16
-; SI-NEXT:    s_lshl_b32 s72, s11, 16
-; SI-NEXT:    s_lshl_b32 s53, s10, 16
-; SI-NEXT:    s_lshl_b32 s74, s13, 16
-; SI-NEXT:    s_lshl_b32 s54, s12, 16
-; SI-NEXT:    s_lshl_b32 s95, s15, 16
-; SI-NEXT:    s_lshl_b32 s55, s14, 16
-; SI-NEXT:    s_lshl_b32 s81, s41, 16
-; SI-NEXT:    s_lshl_b32 s64, s40, 16
-; SI-NEXT:    s_lshl_b32 s82, s43, 16
-; SI-NEXT:    s_lshl_b32 s65, s42, 16
-; SI-NEXT:    s_lshl_b32 s85, s45, 16
-; SI-NEXT:    s_lshl_b32 s66, s44, 16
-; SI-NEXT:    s_lshl_b32 s86, s47, 16
-; SI-NEXT:    s_lshl_b32 s67, s46, 16
-; SI-NEXT:    s_lshl_b32 s76, s57, 16
-; SI-NEXT:    s_lshl_b32 s68, s56, 16
-; SI-NEXT:    s_lshl_b32 s97, s59, 16
-; SI-NEXT:    s_lshl_b32 s69, s58, 16
-; SI-NEXT:    s_lshl_b32 s78, s87, 16
-; SI-NEXT:    s_mov_b32 s6, s91
-; SI-NEXT:    s_lshl_b32 s70, s91, 16
-; SI-NEXT:    s_lshl_b32 s79, s83, 16
-; SI-NEXT:    s_mov_b32 s73, s94
-; SI-NEXT:    s_lshl_b32 s71, s94, 16
-; SI-NEXT:    s_lshl_b32 s26, s84, 16
-; SI-NEXT:    s_mov_b32 s91, s90
-; SI-NEXT:    s_lshl_b32 s90, s90, 16
-; SI-NEXT:    s_mov_b32 s94, s93
-; SI-NEXT:    s_lshl_b32 s17, s93, 16
-; SI-NEXT:    s_mov_b32 s93, s92
-; SI-NEXT:    s_lshl_b32 s80, s92, 16
-; SI-NEXT:    s_mov_b32 s92, s4
+; SI-NEXT:    s_lshl_b32 s24, s23, 16
+; SI-NEXT:    s_lshl_b32 s38, s77, 16
+; SI-NEXT:    s_lshl_b32 s28, s25, 16
+; SI-NEXT:    s_lshl_b32 s39, s78, 16
+; SI-NEXT:    s_lshl_b32 s61, s27, 16
+; SI-NEXT:    s_lshl_b32 s48, s79, 16
+; SI-NEXT:    s_lshl_b32 s89, s29, 16
+; SI-NEXT:    s_lshl_b32 s49, s88, 16
+; SI-NEXT:    s_lshl_b32 s60, s9, 16
+; SI-NEXT:    s_lshl_b32 s50, s8, 16
+; SI-NEXT:    s_lshl_b32 s90, s11, 16
+; SI-NEXT:    s_lshl_b32 s91, s10, 16
+; SI-NEXT:    s_lshl_b32 s70, s13, 16
+; SI-NEXT:    s_lshl_b32 s51, s12, 16
+; SI-NEXT:    s_lshl_b32 s71, s15, 16
+; SI-NEXT:    s_lshl_b32 s52, s14, 16
+; SI-NEXT:    s_lshl_b32 s20, s41, 16
+; SI-NEXT:    s_lshl_b32 s53, s40, 16
+; SI-NEXT:    s_lshl_b32 s81, s43, 16
+; SI-NEXT:    s_lshl_b32 s54, s42, 16
+; SI-NEXT:    s_lshl_b32 s63, s45, 16
+; SI-NEXT:    s_lshl_b32 s55, s44, 16
+; SI-NEXT:    s_lshl_b32 s72, s47, 16
+; SI-NEXT:    s_lshl_b32 s64, s46, 16
+; SI-NEXT:    s_lshl_b32 s82, s57, 16
+; SI-NEXT:    s_lshl_b32 s65, s56, 16
+; SI-NEXT:    s_lshl_b32 s74, s59, 16
+; SI-NEXT:    s_lshl_b32 s66, s58, 16
+; SI-NEXT:    s_lshl_b32 s75, s87, 16
+; SI-NEXT:    s_mov_b32 s73, s6
+; SI-NEXT:    s_lshl_b32 s67, s6, 16
+; SI-NEXT:    s_lshl_b32 s76, s83, 16
+; SI-NEXT:    s_mov_b32 s16, s68
+; SI-NEXT:    s_lshl_b32 s68, s68, 16
+; SI-NEXT:    s_lshl_b32 s85, s84, 16
+; SI-NEXT:    s_mov_b32 s98, s69
+; SI-NEXT:    s_lshl_b32 s69, s69, 16
+; SI-NEXT:    s_lshl_b32 s17, s80, 16
+; SI-NEXT:    s_mov_b32 s6, s62
+; SI-NEXT:    s_lshl_b32 s26, s62, 16
 ; SI-NEXT:    s_mov_b64 s[4:5], 0
 ; SI-NEXT:    s_branch .LBB107_3
 ; SI-NEXT:  .LBB107_2:
 ; SI-NEXT:    ; implicit-def: $sgpr17
 ; SI-NEXT:    ; kill: killed $sgpr17
-; SI-NEXT:    s_mov_b32 s73, s94
+; SI-NEXT:    s_mov_b32 s16, s68
 ; SI-NEXT:    ; implicit-def: $sgpr17
 ; SI-NEXT:    ; kill: killed $sgpr17
-; SI-NEXT:    s_mov_b32 s6, s91
+; SI-NEXT:    s_mov_b32 s73, s6
 ; SI-NEXT:    ; implicit-def: $sgpr17
 ; SI-NEXT:    ; kill: killed $sgpr17
-; SI-NEXT:    s_mov_b32 s94, s93
+; SI-NEXT:    s_mov_b32 s6, s62
 ; SI-NEXT:    ; implicit-def: $sgpr17
 ; SI-NEXT:    ; kill: killed $sgpr17
-; SI-NEXT:    s_mov_b32 s93, s92
-; SI-NEXT:    ; implicit-def: $sgpr17
-; SI-NEXT:    ; kill: killed $sgpr17
-; SI-NEXT:    s_mov_b32 s91, s90
+; SI-NEXT:    s_mov_b32 s98, s69
 ; SI-NEXT:    ; implicit-def: $sgpr17
 ; SI-NEXT:    ; kill: killed $sgpr17
 ; SI-NEXT:    s_mov_b64 s[4:5], -1
 ; SI-NEXT:    ; implicit-def: $sgpr17
 ; SI-NEXT:    ; kill: killed $sgpr17
-; SI-NEXT:    ; implicit-def: $sgpr16
-; SI-NEXT:    ; implicit-def: $sgpr92
-; SI-NEXT:    ; kill: killed $sgpr16
-; SI-NEXT:    ; implicit-def: $sgpr16
-; SI-NEXT:    ; implicit-def: $sgpr17
-; SI-NEXT:    ; kill: killed $sgpr17
-; SI-NEXT:    ; implicit-def: $sgpr63
-; SI-NEXT:    ; implicit-def: $sgpr98
-; SI-NEXT:    ; implicit-def: $sgpr62
+; SI-NEXT:    ; implicit-def: $sgpr7
 ; SI-NEXT:    ; implicit-def: $sgpr96
+; SI-NEXT:    ; implicit-def: $sgpr99
+; SI-NEXT:    ; implicit-def: $sgpr97
+; SI-NEXT:    ; implicit-def: $sgpr92
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr95
+; SI-NEXT:    ; implicit-def: $sgpr93
+; SI-NEXT:    ; implicit-def: $sgpr30
 ; SI-NEXT:    ; implicit-def: $sgpr31
 ; SI-NEXT:    ; implicit-def: $sgpr34
 ; SI-NEXT:    ; implicit-def: $sgpr35
-; SI-NEXT:    ; implicit-def: $sgpr30
+; SI-NEXT:    ; implicit-def: $sgpr86
 ; SI-NEXT:    ; implicit-def: $sgpr36
-; SI-NEXT:    ; implicit-def: $sgpr99
+; SI-NEXT:    ; implicit-def: $sgpr22
 ; SI-NEXT:    ; implicit-def: $sgpr37
-; SI-NEXT:    ; implicit-def: $sgpr19
+; SI-NEXT:    ; implicit-def: $sgpr24
 ; SI-NEXT:    ; implicit-def: $sgpr38
-; SI-NEXT:    ; implicit-def: $sgpr20
+; SI-NEXT:    ; implicit-def: $sgpr28
 ; SI-NEXT:    ; implicit-def: $sgpr39
-; SI-NEXT:    ; implicit-def: $sgpr22
+; SI-NEXT:    ; implicit-def: $sgpr61
 ; SI-NEXT:    ; implicit-def: $sgpr48
-; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr89
 ; SI-NEXT:    ; implicit-def: $sgpr49
-; SI-NEXT:    ; implicit-def: $sgpr24
+; SI-NEXT:    ; implicit-def: $sgpr60
 ; SI-NEXT:    ; implicit-def: $sgpr50
-; SI-NEXT:    ; implicit-def: $sgpr61
+; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $sgpr91
+; SI-NEXT:    ; implicit-def: $sgpr70
 ; SI-NEXT:    ; implicit-def: $sgpr51
-; SI-NEXT:    ; implicit-def: $sgpr28
+; SI-NEXT:    ; implicit-def: $sgpr71
 ; SI-NEXT:    ; implicit-def: $sgpr52
-; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr20
 ; SI-NEXT:    ; implicit-def: $sgpr53
-; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr81
 ; SI-NEXT:    ; implicit-def: $sgpr54
-; SI-NEXT:    ; implicit-def: $sgpr95
+; SI-NEXT:    ; implicit-def: $sgpr63
 ; SI-NEXT:    ; implicit-def: $sgpr55
-; SI-NEXT:    ; implicit-def: $sgpr81
+; SI-NEXT:    ; implicit-def: $sgpr72
 ; SI-NEXT:    ; implicit-def: $sgpr64
 ; SI-NEXT:    ; implicit-def: $sgpr82
 ; SI-NEXT:    ; implicit-def: $sgpr65
-; SI-NEXT:    ; implicit-def: $sgpr85
+; SI-NEXT:    ; implicit-def: $sgpr74
 ; SI-NEXT:    ; implicit-def: $sgpr66
-; SI-NEXT:    ; implicit-def: $sgpr86
+; SI-NEXT:    ; implicit-def: $sgpr75
 ; SI-NEXT:    ; implicit-def: $sgpr67
 ; SI-NEXT:    ; implicit-def: $sgpr76
 ; SI-NEXT:    ; implicit-def: $sgpr68
-; SI-NEXT:    ; implicit-def: $sgpr97
+; SI-NEXT:    ; implicit-def: $sgpr85
 ; SI-NEXT:    ; implicit-def: $sgpr69
-; SI-NEXT:    ; implicit-def: $sgpr78
-; SI-NEXT:    ; implicit-def: $sgpr70
-; SI-NEXT:    ; implicit-def: $sgpr79
-; SI-NEXT:    ; implicit-def: $sgpr71
 ; SI-NEXT:    ; implicit-def: $sgpr26
-; SI-NEXT:    ; implicit-def: $sgpr90
-; SI-NEXT:    ; implicit-def: $sgpr80
+; SI-NEXT:    ; implicit-def: $sgpr17
+; SI-NEXT:    ; kill: killed $sgpr17
+; SI-NEXT:    ; implicit-def: $sgpr17
+; SI-NEXT:    ; kill: killed $sgpr17
+; SI-NEXT:    ; implicit-def: $sgpr17
+; SI-NEXT:    ; kill: killed $sgpr17
+; SI-NEXT:    ; implicit-def: $sgpr17
+; SI-NEXT:    ; kill: killed $sgpr17
 ; SI-NEXT:    ; implicit-def: $sgpr17
 ; SI-NEXT:    ; kill: killed $sgpr17
 ; SI-NEXT:    ; implicit-def: $sgpr17
@@ -244123,97 +244121,86 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; SI-NEXT:    ; implicit-def: $sgpr17
 ; SI-NEXT:  .LBB107_3: ; %Flow
 ; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; SI-NEXT:    s_mov_b32 s4, s17
-; SI-NEXT:    s_mov_b32 s17, s30
-; SI-NEXT:    s_mov_b32 s30, s99
-; SI-NEXT:    s_mov_b32 s99, s16
+; SI-NEXT:    s_mov_b32 s5, s17
+; SI-NEXT:    s_mov_b32 s17, s86
+; SI-NEXT:    s_mov_b32 s86, s7
 ; SI-NEXT:    s_cbranch_vccnz .LBB107_5
 ; SI-NEXT:  ; %bb.4: ; %cmp.true
-; SI-NEXT:    s_add_i32 s4, s94, 3
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s93, 16
-; SI-NEXT:    s_add_i32 s84, s84, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s5, s84, 0xffff
-; SI-NEXT:    s_lshl_b32 s60, s91, 16
-; SI-NEXT:    s_add_i32 s83, s83, 3
-; SI-NEXT:    s_or_b32 s5, s60, s5
-; SI-NEXT:    s_and_b32 s60, s83, 0xffff
-; SI-NEXT:    s_lshl_b32 s61, s73, 16
-; SI-NEXT:    s_or_b32 s79, s61, s60
-; SI-NEXT:    s_lshl_b32 s61, s6, 16
-; SI-NEXT:    s_add_i32 s9, s9, 3
+; SI-NEXT:    s_lshl_b32 s5, s6, 16
 ; SI-NEXT:    v_readlane_b32 s6, v41, 24
-; SI-NEXT:    s_add_i32 s11, s11, 3
-; SI-NEXT:    s_and_b32 s9, s9, 0xffff
-; SI-NEXT:    s_lshl_b32 s8, s8, 16
-; SI-NEXT:    s_add_i32 s7, s7, 3
 ; SI-NEXT:    s_lshl_b32 s20, s6, 16
 ; SI-NEXT:    v_readlane_b32 s6, v41, 23
+; SI-NEXT:    s_lshl_b32 s17, s6, 16
+; SI-NEXT:    v_readlane_b32 s6, v41, 22
+; SI-NEXT:    s_lshl_b32 s61, s16, 16
+; SI-NEXT:    s_add_i32 s16, s6, 3
+; SI-NEXT:    v_readlane_b32 s6, v41, 21
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s7, s6, 16
+; SI-NEXT:    v_readlane_b32 s6, v41, 20
+; SI-NEXT:    s_or_b32 s7, s7, s16
+; SI-NEXT:    s_add_i32 s6, s6, 3
+; SI-NEXT:    v_readlane_b32 s16, v41, 19
+; SI-NEXT:    s_add_i32 s19, s19, 3
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s16, 16
+; SI-NEXT:    s_and_b32 s19, s19, 0xffff
+; SI-NEXT:    s_or_b32 s6, s16, s6
+; SI-NEXT:    v_readlane_b32 s16, v41, 18
+; SI-NEXT:    s_lshl_b32 s60, s98, 16
+; SI-NEXT:    s_or_b32 s17, s17, s19
+; SI-NEXT:    s_add_i32 s98, s16, 3
+; SI-NEXT:    v_readlane_b32 s19, v41, 17
+; SI-NEXT:    s_add_i32 s21, s21, 3
+; SI-NEXT:    s_and_b32 s16, s98, 0xffff
+; SI-NEXT:    s_lshl_b32 s19, s19, 16
+; SI-NEXT:    s_add_i32 s11, s11, 3
+; SI-NEXT:    s_add_i32 s9, s9, 3
+; SI-NEXT:    s_and_b32 s21, s21, 0xffff
+; SI-NEXT:    s_or_b32 s16, s19, s16
+; SI-NEXT:    v_readlane_b32 s19, v41, 16
 ; SI-NEXT:    s_add_i32 s13, s13, 3
 ; SI-NEXT:    s_and_b32 s11, s11, 0xffff
 ; SI-NEXT:    s_lshl_b32 s10, s10, 16
-; SI-NEXT:    s_or_b32 s8, s8, s9
-; SI-NEXT:    s_and_b32 s7, s7, 0xffff
-; SI-NEXT:    s_lshl_b32 s9, s89, 16
+; SI-NEXT:    s_and_b32 s9, s9, 0xffff
+; SI-NEXT:    s_lshl_b32 s8, s8, 16
 ; SI-NEXT:    s_add_i32 s29, s29, 3
-; SI-NEXT:    s_lshl_b32 s19, s6, 16
-; SI-NEXT:    v_readlane_b32 s6, v41, 22
+; SI-NEXT:    s_or_b32 s20, s20, s21
+; SI-NEXT:    s_add_i32 s96, s19, 3
+; SI-NEXT:    v_readlane_b32 s21, v41, 15
 ; SI-NEXT:    s_add_i32 s15, s15, 3
 ; SI-NEXT:    s_and_b32 s13, s13, 0xffff
 ; SI-NEXT:    s_lshl_b32 s12, s12, 16
 ; SI-NEXT:    s_or_b32 s10, s10, s11
-; SI-NEXT:    s_or_b32 s7, s9, s7
+; SI-NEXT:    s_or_b32 s8, s8, s9
 ; SI-NEXT:    s_and_b32 s9, s29, 0xffff
 ; SI-NEXT:    s_lshl_b32 s11, s88, 16
 ; SI-NEXT:    s_add_i32 s27, s27, 3
-; SI-NEXT:    s_add_i32 s16, s6, 3
-; SI-NEXT:    v_readlane_b32 s6, v41, 21
+; SI-NEXT:    s_and_b32 s19, s96, 0xffff
+; SI-NEXT:    s_lshl_b32 s21, s21, 16
 ; SI-NEXT:    s_and_b32 s15, s15, 0xffff
 ; SI-NEXT:    s_lshl_b32 s14, s14, 16
 ; SI-NEXT:    s_or_b32 s12, s12, s13
 ; SI-NEXT:    s_or_b32 s9, s11, s9
 ; SI-NEXT:    s_and_b32 s11, s27, 0xffff
-; SI-NEXT:    s_lshl_b32 s13, s77, 16
+; SI-NEXT:    s_lshl_b32 s13, s79, 16
 ; SI-NEXT:    s_add_i32 s25, s25, 3
-; SI-NEXT:    s_and_b32 s16, s16, 0xffff
-; SI-NEXT:    s_lshl_b32 s17, s6, 16
-; SI-NEXT:    v_readlane_b32 s6, v41, 20
+; SI-NEXT:    s_or_b32 s19, s21, s19
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    v_readlane_b32 s21, v41, 14
 ; SI-NEXT:    s_or_b32 s14, s14, s15
 ; SI-NEXT:    s_or_b32 s11, s13, s11
 ; SI-NEXT:    s_and_b32 s13, s25, 0xffff
-; SI-NEXT:    s_lshl_b32 s15, s75, 16
+; SI-NEXT:    s_lshl_b32 s15, s78, 16
 ; SI-NEXT:    s_add_i32 s23, s23, 3
-; SI-NEXT:    s_or_b32 s16, s17, s16
-; SI-NEXT:    s_add_i32 s6, s6, 3
-; SI-NEXT:    v_readlane_b32 s17, v41, 19
-; SI-NEXT:    s_or_b32 s13, s15, s13
-; SI-NEXT:    s_and_b32 s15, s23, 0xffff
-; SI-NEXT:    s_add_i32 s21, s21, 3
-; SI-NEXT:    s_and_b32 s6, s6, 0xffff
-; SI-NEXT:    s_lshl_b32 s17, s17, 16
-; SI-NEXT:    s_or_b32 s15, s20, s15
-; SI-NEXT:    s_and_b32 s20, s21, 0xffff
-; SI-NEXT:    s_or_b32 s6, s17, s6
-; SI-NEXT:    v_readlane_b32 s17, v41, 18
-; SI-NEXT:    s_or_b32 s19, s19, s20
-; SI-NEXT:    s_add_i32 s98, s17, 3
-; SI-NEXT:    v_readlane_b32 s20, v41, 17
-; SI-NEXT:    s_and_b32 s17, s98, 0xffff
-; SI-NEXT:    s_lshl_b32 s20, s20, 16
-; SI-NEXT:    s_or_b32 s17, s20, s17
-; SI-NEXT:    v_readlane_b32 s20, v41, 16
-; SI-NEXT:    s_add_i32 s96, s20, 3
-; SI-NEXT:    v_readlane_b32 s21, v41, 15
-; SI-NEXT:    s_and_b32 s20, s96, 0xffff
-; SI-NEXT:    s_lshl_b32 s21, s21, 16
-; SI-NEXT:    s_or_b32 s20, s21, s20
-; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    v_readlane_b32 s21, v41, 14
 ; SI-NEXT:    s_and_b32 s18, s18, 0xffff
 ; SI-NEXT:    s_lshl_b32 s21, s21, 16
+; SI-NEXT:    s_or_b32 s13, s15, s13
+; SI-NEXT:    s_and_b32 s15, s23, 0xffff
+; SI-NEXT:    s_lshl_b32 s22, s77, 16
 ; SI-NEXT:    s_or_b32 s18, s21, s18
 ; SI-NEXT:    v_readlane_b32 s21, v41, 13
+; SI-NEXT:    s_or_b32 s15, s22, s15
 ; SI-NEXT:    s_add_i32 s21, s21, 3
 ; SI-NEXT:    v_readlane_b32 s22, v41, 12
 ; SI-NEXT:    s_and_b32 s21, s21, 0xffff
@@ -244257,27 +244244,40 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; SI-NEXT:    s_or_b32 s27, s28, s27
 ; SI-NEXT:    s_add_i32 s27, s27, 0x30000
 ; SI-NEXT:    s_add_i32 s26, s26, 0x30000
-; SI-NEXT:    s_and_b32 s28, s27, 0xffff0000
+; SI-NEXT:    s_and_b32 s86, s27, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s27, s27, 16
 ; SI-NEXT:    s_add_i32 s25, s25, 0x30000
-; SI-NEXT:    v_writelane_b32 v41, s28, 25
-; SI-NEXT:    s_and_b32 s99, s26, 0xffff0000
+; SI-NEXT:    v_writelane_b32 v41, s27, 25
+; SI-NEXT:    s_and_b32 s96, s26, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s26, s26, 16
 ; SI-NEXT:    s_add_i32 s24, s24, 0x30000
 ; SI-NEXT:    v_writelane_b32 v41, s26, 26
-; SI-NEXT:    s_and_b32 s63, s25, 0xffff0000
+; SI-NEXT:    s_and_b32 s99, s25, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s25, s25, 16
 ; SI-NEXT:    s_add_i32 s23, s23, 0x30000
 ; SI-NEXT:    v_writelane_b32 v41, s25, 27
-; SI-NEXT:    s_and_b32 s98, s24, 0xffff0000
+; SI-NEXT:    s_and_b32 s97, s24, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s24, s24, 16
+; SI-NEXT:    s_add_i32 s80, s80, 3
 ; SI-NEXT:    s_add_i32 s22, s22, 0x30000
 ; SI-NEXT:    v_writelane_b32 v41, s24, 28
-; SI-NEXT:    s_and_b32 s62, s23, 0xffff0000
+; SI-NEXT:    s_and_b32 s92, s23, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s23, s23, 16
+; SI-NEXT:    s_and_b32 s4, s80, 0xffff
+; SI-NEXT:    s_add_i32 s84, s84, 3
 ; SI-NEXT:    s_add_i32 s21, s21, 0x30000
 ; SI-NEXT:    v_writelane_b32 v41, s23, 29
-; SI-NEXT:    s_and_b32 s96, s22, 0xffff0000
+; SI-NEXT:    s_and_b32 s94, s22, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s22, s22, 16
+; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_and_b32 s5, s84, 0xffff
+; SI-NEXT:    s_add_i32 s83, s83, 3
+; SI-NEXT:    s_add_i32 s18, s18, 0x30000
+; SI-NEXT:    v_writelane_b32 v41, s22, 30
+; SI-NEXT:    s_and_b32 s95, s21, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s21, s21, 16
+; SI-NEXT:    s_or_b32 s5, s60, s5
+; SI-NEXT:    s_and_b32 s60, s83, 0xffff
 ; SI-NEXT:    s_add_i32 s87, s87, 3
 ; SI-NEXT:    s_add_i32 s59, s59, 3
 ; SI-NEXT:    s_add_i32 s57, s57, 3
@@ -244285,11 +244285,13 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; SI-NEXT:    s_add_i32 s45, s45, 3
 ; SI-NEXT:    s_add_i32 s43, s43, 3
 ; SI-NEXT:    s_add_i32 s41, s41, 3
-; SI-NEXT:    s_add_i32 s18, s18, 0x30000
-; SI-NEXT:    v_writelane_b32 v41, s22, 30
-; SI-NEXT:    s_and_b32 s31, s21, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s21, s21, 16
+; SI-NEXT:    s_add_i32 s19, s19, 0x30000
+; SI-NEXT:    v_writelane_b32 v41, s21, 31
+; SI-NEXT:    s_and_b32 s93, s18, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_or_b32 s76, s61, s60
 ; SI-NEXT:    s_and_b32 s60, s87, 0xffff
+; SI-NEXT:    s_lshl_b32 s61, s73, 16
 ; SI-NEXT:    s_and_b32 s59, s59, 0xffff
 ; SI-NEXT:    s_lshl_b32 s58, s58, 16
 ; SI-NEXT:    s_and_b32 s57, s57, 0xffff
@@ -244302,11 +244304,10 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; SI-NEXT:    s_lshl_b32 s42, s42, 16
 ; SI-NEXT:    s_and_b32 s41, s41, 0xffff
 ; SI-NEXT:    s_lshl_b32 s40, s40, 16
-; SI-NEXT:    s_add_i32 s20, s20, 0x30000
-; SI-NEXT:    v_writelane_b32 v41, s21, 31
-; SI-NEXT:    s_and_b32 s34, s18, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s18, s18, 16
-; SI-NEXT:    s_or_b32 s78, s61, s60
+; SI-NEXT:    s_add_i32 s16, s16, 0x30000
+; SI-NEXT:    v_writelane_b32 v41, s18, 32
+; SI-NEXT:    s_lshl_b32 s18, s19, 16
+; SI-NEXT:    s_or_b32 s75, s61, s60
 ; SI-NEXT:    s_or_b32 s58, s58, s59
 ; SI-NEXT:    s_or_b32 s56, s56, s57
 ; SI-NEXT:    s_or_b32 s46, s46, s47
@@ -244314,12 +244315,13 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; SI-NEXT:    s_or_b32 s42, s42, s43
 ; SI-NEXT:    s_or_b32 s40, s40, s41
 ; SI-NEXT:    s_add_i32 s6, s6, 0x30000
-; SI-NEXT:    v_writelane_b32 v41, s18, 32
-; SI-NEXT:    s_lshl_b32 s18, s20, 16
+; SI-NEXT:    v_writelane_b32 v41, s18, 33
+; SI-NEXT:    s_and_b32 s31, s16, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s16, s16, 16
 ; SI-NEXT:    s_add_i32 s4, s4, 0x30000
 ; SI-NEXT:    s_add_i32 s5, s5, 0x30000
-; SI-NEXT:    s_add_i32 s79, s79, 0x30000
-; SI-NEXT:    s_add_i32 s78, s78, 0x30000
+; SI-NEXT:    s_add_i32 s76, s76, 0x30000
+; SI-NEXT:    s_add_i32 s75, s75, 0x30000
 ; SI-NEXT:    s_add_i32 s58, s58, 0x30000
 ; SI-NEXT:    s_add_i32 s56, s56, 0x30000
 ; SI-NEXT:    s_add_i32 s46, s46, 0x30000
@@ -244330,296 +244332,293 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
 ; SI-NEXT:    s_add_i32 s12, s12, 0x30000
 ; SI-NEXT:    s_add_i32 s10, s10, 0x30000
 ; SI-NEXT:    s_add_i32 s8, s8, 0x30000
-; SI-NEXT:    s_add_i32 s7, s7, 0x30000
 ; SI-NEXT:    s_add_i32 s9, s9, 0x30000
 ; SI-NEXT:    s_add_i32 s11, s11, 0x30000
 ; SI-NEXT:    s_add_i32 s13, s13, 0x30000
 ; SI-NEXT:    s_add_i32 s15, s15, 0x30000
-; SI-NEXT:    s_add_i32 s19, s19, 0x30000
-; SI-NEXT:    s_add_i32 s16, s16, 0x30000
+; SI-NEXT:    s_add_i32 s20, s20, 0x30000
 ; SI-NEXT:    s_add_i32 s17, s17, 0x30000
-; SI-NEXT:    v_writelane_b32 v41, s18, 33
-; SI-NEXT:    s_and_b32 s30, s6, 0xffff0000
+; SI-NEXT:    s_add_i32 s7, s7, 0x30000
+; SI-NEXT:    v_writelane_b32 v41, s16, 34
+; SI-NEXT:    s_and_b32 s34, s6, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    s_lshl_b32 s92, s27, 16
-; SI-NEXT:    s_and_b32 s35, s20, 0xffff0000
+; SI-NEXT:    s_and_b32 s30, s19, 0xffff0000
+; SI-NEXT:    v_writelane_b32 v41, s6, 35
+; SI-NEXT:    s_and_b32 s35, s7, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s6, s7, 16
 ; SI-NEXT:    s_and_b32 s36, s17, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s17, s17, 16
-; SI-NEXT:    v_writelane_b32 v41, s6, 34
-; SI-NEXT:    s_and_b32 s37, s16, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s6, s16, 16
-; SI-NEXT:    s_and_b32 s38, s19, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s19, s19, 16
-; SI-NEXT:    s_and_b32 s39, s15, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s20, s15, 16
-; SI-NEXT:    s_and_b32 s48, s13, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s22, s13, 16
-; SI-NEXT:    s_and_b32 s49, s11, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s60, s11, 16
-; SI-NEXT:    s_and_b32 s50, s9, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s24, s9, 16
-; SI-NEXT:    s_and_b32 s51, s7, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s61, s7, 16
-; SI-NEXT:    s_and_b32 s52, s8, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s28, s8, 16
-; SI-NEXT:    s_and_b32 s53, s10, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s72, s10, 16
-; SI-NEXT:    s_and_b32 s54, s12, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s74, s12, 16
-; SI-NEXT:    s_and_b32 s55, s14, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s95, s14, 16
-; SI-NEXT:    s_and_b32 s64, s40, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s81, s40, 16
-; SI-NEXT:    s_and_b32 s65, s42, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s82, s42, 16
-; SI-NEXT:    s_and_b32 s66, s44, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s85, s44, 16
-; SI-NEXT:    s_and_b32 s67, s46, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s86, s46, 16
-; SI-NEXT:    s_and_b32 s68, s56, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s76, s56, 16
-; SI-NEXT:    s_and_b32 s69, s58, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s97, s58, 16
-; SI-NEXT:    s_and_b32 s70, s78, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s78, s78, 16
-; SI-NEXT:    s_and_b32 s71, s79, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s79, s79, 16
-; SI-NEXT:    s_and_b32 s90, s5, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s26, s5, 16
-; SI-NEXT:    s_and_b32 s80, s4, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s4, s4, 16
-; SI-NEXT:    v_writelane_b32 v41, s6, 35
+; SI-NEXT:    s_and_b32 s37, s20, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s22, s20, 16
+; SI-NEXT:    s_and_b32 s38, s15, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s24, s15, 16
+; SI-NEXT:    s_and_b32 s39, s13, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s28, s13, 16
+; SI-NEXT:    s_and_b32 s48, s11, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s61, s11, 16
+; SI-NEXT:    s_and_b32 s49, s9, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s89, s9, 16
+; SI-NEXT:    s_and_b32 s50, s8, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s60, s8, 16
+; SI-NEXT:    s_and_b32 s91, s10, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s90, s10, 16
+; SI-NEXT:    s_and_b32 s51, s12, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s70, s12, 16
+; SI-NEXT:    s_and_b32 s52, s14, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s71, s14, 16
+; SI-NEXT:    s_and_b32 s53, s40, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s20, s40, 16
+; SI-NEXT:    s_and_b32 s54, s42, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s81, s42, 16
+; SI-NEXT:    s_and_b32 s55, s44, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s63, s44, 16
+; SI-NEXT:    s_and_b32 s64, s46, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s72, s46, 16
+; SI-NEXT:    s_and_b32 s65, s56, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s82, s56, 16
+; SI-NEXT:    s_and_b32 s66, s58, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s74, s58, 16
+; SI-NEXT:    s_and_b32 s67, s75, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s75, s75, 16
+; SI-NEXT:    s_and_b32 s68, s76, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s76, s76, 16
+; SI-NEXT:    s_and_b32 s69, s5, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s85, s5, 16
+; SI-NEXT:    s_and_b32 s26, s4, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s5, s4, 16
+; SI-NEXT:    v_writelane_b32 v41, s6, 36
 ; SI-NEXT:  .LBB107_5: ; %end
-; SI-NEXT:    v_readlane_b32 s5, v41, 25
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s5
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s86
+; SI-NEXT:    v_readlane_b32 s4, v41, 25
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s92
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s4
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s99
-; SI-NEXT:    v_readlane_b32 s5, v41, 26
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s96
+; SI-NEXT:    v_readlane_b32 s4, v41, 26
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s5
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s4
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s63
-; SI-NEXT:    v_readlane_b32 s5, v41, 27
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s99
+; SI-NEXT:    v_readlane_b32 s4, v41, 27
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s5
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s4
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s98
-; SI-NEXT:    v_readlane_b32 s5, v41, 28
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s97
+; SI-NEXT:    v_readlane_b32 s4, v41, 28
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s5
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s4
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 12, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s62
-; SI-NEXT:    v_readlane_b32 s5, v41, 29
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s92
+; SI-NEXT:    v_readlane_b32 s4, v41, 29
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s5
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s4
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 16, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s96
-; SI-NEXT:    v_readlane_b32 s5, v41, 30
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s94
+; SI-NEXT:    v_readlane_b32 s4, v41, 30
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s5
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s4
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 20, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s31
-; SI-NEXT:    v_readlane_b32 s5, v41, 31
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s95
+; SI-NEXT:    v_readlane_b32 s4, v41, 31
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s5
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s4
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 24, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s34
-; SI-NEXT:    v_readlane_b32 s5, v41, 32
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s93
+; SI-NEXT:    v_readlane_b32 s4, v41, 32
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s5
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s4
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 28, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s35
-; SI-NEXT:    v_readlane_b32 s5, v41, 33
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s30
+; SI-NEXT:    v_readlane_b32 s4, v41, 33
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s5
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s4
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s36
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s31
+; SI-NEXT:    v_readlane_b32 s4, v41, 34
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s17
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s4
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 36, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s30
-; SI-NEXT:    v_readlane_b32 s5, v41, 34
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s34
+; SI-NEXT:    v_readlane_b32 s4, v41, 35
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s5
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s4
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 40, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s37
-; SI-NEXT:    v_readlane_b32 s5, v41, 35
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s35
+; SI-NEXT:    v_readlane_b32 s4, v41, 36
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s5
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s4
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s38
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s36
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s19
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s17
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 48, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s39
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s37
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s20
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s22
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s48
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s38
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s22
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s24
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s49
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s39
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s60
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s28
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s50
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s48
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s24
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s61
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 64, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s51
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s49
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s61
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s89
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s52
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s50
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s28
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s60
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x48, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s53
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s91
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s72
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s90
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s54
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s51
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s74
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s70
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x50, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s55
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s52
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s95
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s71
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s64
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s53
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s81
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s20
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s65
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s54
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s82
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s81
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s66
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s55
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s85
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s63
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s67
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s64
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s86
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s72
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s68
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s65
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s76
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s82
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s69
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s66
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s97
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s74
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x6c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s70
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s67
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s78
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s75
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s71
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s68
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s79
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s76
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x74, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s90
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s69
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s26
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s85
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s80
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s26
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s4
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s5
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
@@ -245124,94 +245123,94 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:68
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:76
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:80
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:84
-; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:92
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:88
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v63, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v44, v4
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:80
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:84
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:92
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:88
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v34, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v43, v3
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v5
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v30
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v55, v7
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v9
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v22
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; SI-NEXT:    v_cvt_f16_f32_e32 v40, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v10
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v13
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v30
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v21
 ; SI-NEXT:    v_cvt_f16_f32_e32 v53, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v43, v3
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, v2
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v20, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v52, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v17
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, v18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v18, v19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v19, v23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v23, v25
+; SI-NEXT:    v_cvt_f16_f32_e32 v44, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v52, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v15
+; SI-NEXT:    v_cvt_f16_f32_e32 v48, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v17
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v20, v24
 ; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT:    v_cvt_f16_f32_e32 v24, v35
 ; SI-NEXT:    v_cvt_f16_f32_e32 v31, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v25, v50
 ; SI-NEXT:    v_cvt_f16_f32_e32 v27, v29
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v42
 ; SI-NEXT:    v_cvt_f16_f32_e32 v21, v47
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v50
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, v51
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f16_f32_e32 v47, v54
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v41
-; SI-NEXT:    v_cvt_f16_f32_e32 v54, v46
 ; SI-NEXT:    v_cvt_f16_f32_e32 v22, v38
 ; SI-NEXT:    v_cvt_f16_f32_e32 v38, v49
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, v58
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v45
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v47, v54
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, v35
+; SI-NEXT:    v_cvt_f16_f32_e32 v54, v46
 ; SI-NEXT:    v_cvt_f16_f32_e32 v42, v56
+; SI-NEXT:    v_cvt_f16_f32_e32 v29, v51
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, v41
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v59
+; SI-NEXT:    v_cvt_f16_f32_e32 v17, v45
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v57
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, v59
+; SI-NEXT:    v_cvt_f16_f32_e32 v58, v58
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v62
 ; SI-NEXT:    s_waitcnt vmcnt(13)
-; SI-NEXT:    v_cvt_f16_f32_e32 v58, v32
-; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v33
-; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v36
-; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v37
+; SI-NEXT:    v_cvt_f16_f32_e32 v49, v32
+; SI-NEXT:    v_cvt_f16_f32_e32 v51, v60
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_cvt_f16_f32_e32 v50, v37
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:96
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:100
 ; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:104
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:108
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:112
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:116
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:120
-; SI-NEXT:    v_cvt_f16_f32_e32 v51, v60
-; SI-NEXT:    v_cvt_f16_f32_e32 v56, v61
-; SI-NEXT:    v_cvt_f16_f32_e32 v61, v62
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:116
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:120
+; SI-NEXT:    v_cvt_f16_f32_e32 v61, v61
+; SI-NEXT:    v_cvt_f16_f32_e32 v35, v33
+; SI-NEXT:    v_cvt_f16_f32_e32 v33, v63
+; SI-NEXT:    v_cvt_f16_f32_e32 v36, v36
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_cvt_f16_f32_e32 v45, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v50, v39
+; SI-NEXT:    v_cvt_f16_f32_e32 v45, v39
 ; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v6
 ; SI-NEXT:    s_waitcnt vmcnt(5)
@@ -245220,15 +245219,15 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v41, v8
 ; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v30, v10
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v32
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v37
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:124
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:128
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:132
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v32
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v46
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cvt_f16_f32_e32 v46, v46
+; SI-NEXT:    v_cvt_f16_f32_e32 v46, v56
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v32, v6
 ; SI-NEXT:    s_waitcnt vmcnt(1)
@@ -245241,18 +245240,17 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_mov_b32_e32 v59, v29
 ; SI-NEXT:    v_mov_b32_e32 v29, v27
 ; SI-NEXT:    v_mov_b32_e32 v57, v23
-; SI-NEXT:    v_mov_b32_e32 v60, v4
-; SI-NEXT:    v_mov_b32_e32 v62, v13
+; SI-NEXT:    v_mov_b32_e32 v60, v3
+; SI-NEXT:    v_mov_b32_e32 v62, v4
+; SI-NEXT:    v_mov_b32_e32 v63, v49
+; SI-NEXT:    v_mov_b32_e32 v49, v12
 ; SI-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB108_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v61
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v61, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
@@ -245282,20 +245280,17 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v14, v14, v37
 ; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v36
 ; SI-NEXT:    v_or_b32_e32 v33, v33, v37
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v56
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v58
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
 ; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
 ; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
 ; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
 ; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
 ; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
 ; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v58, v12
 ; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
 ; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
@@ -245305,106 +245300,111 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
 ; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
 ; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
 ; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
 ; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v25
 ; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v61
 ; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
 ; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
 ; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
 ; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_or_b32_e32 v56, v37, v39
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
 ; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
 ; SI-NEXT:    v_add_f32_e32 v31, 0x38000000, v31
 ; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
-; SI-NEXT:    v_or_b32_e32 v11, v11, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
 ; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
 ; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
-; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_or_b32_e32 v16, v16, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v22
+; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
 ; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v39, v52
+; SI-NEXT:    v_cvt_f32_f16_e32 v34, v34
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v61
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_add_f32_e32 v34, 0x38000000, v34
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v34, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v4
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_or_b32_e32 v61, v3, v37
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v58
+; SI-NEXT:    v_or_b32_e32 v11, v11, v37
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v17
+; SI-NEXT:    v_or_b32_e32 v16, v16, v37
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v22
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v37
 ; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_or_b32_e32 v24, v24, v37
 ; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v28
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v31, v31, v37
 ; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v20
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v13
 ; SI-NEXT:    v_or_b32_e32 v18, v18, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v37, v48
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
+; SI-NEXT:    v_cvt_f16_f32_e32 v48, v37
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v48
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v37
 ; SI-NEXT:    v_cvt_f32_f16_e32 v37, v53
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v52
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, v34
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
 ; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
 ; SI-NEXT:    v_cvt_f16_f32_e32 v53, v37
 ; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v39
 ; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT:    v_add_f32_e32 v34, 0x38000000, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v53
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; SI-NEXT:    v_or_b32_e32 v52, v37, v39
 ; SI-NEXT:    v_cvt_f32_f16_e32 v37, v40
 ; SI-NEXT:    v_cvt_f32_f16_e32 v39, v55
-; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v56, v62
+; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
 ; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
 ; SI-NEXT:    v_cvt_f16_f32_e32 v40, v37
 ; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v39
 ; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_add_f32_e32 v56, 0x38000000, v56
 ; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v40
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v56, v56
 ; SI-NEXT:    v_or_b32_e32 v55, v37, v39
 ; SI-NEXT:    v_cvt_f32_f16_e32 v37, v44
 ; SI-NEXT:    v_cvt_f32_f16_e32 v39, v43
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v57, v57
+; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
 ; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
 ; SI-NEXT:    v_cvt_f16_f32_e32 v44, v37
 ; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v39
 ; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
 ; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v44
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v51, v51
 ; SI-NEXT:    v_or_b32_e32 v43, v37, v39
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v63
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v39
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v39
-; SI-NEXT:    v_cvt_f32_f16_e32 v57, v57
-; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
 ; SI-NEXT:    v_cvt_f32_f16_e32 v47, v47
 ; SI-NEXT:    v_cvt_f32_f16_e32 v42, v42
-; SI-NEXT:    v_cvt_f32_f16_e32 v51, v51
-; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; SI-NEXT:    v_add_f32_e32 v47, 0x38000000, v47
-; SI-NEXT:    v_cvt_f16_f32_e32 v47, v47
 ; SI-NEXT:    v_cvt_f32_f16_e32 v35, v35
 ; SI-NEXT:    v_cvt_f32_f16_e32 v50, v50
 ; SI-NEXT:    v_cvt_f32_f16_e32 v30, v30
@@ -245412,22 +245412,27 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v29, v29
 ; SI-NEXT:    v_cvt_f32_f16_e32 v38, v38
 ; SI-NEXT:    v_cvt_f32_f16_e32 v54, v54
-; SI-NEXT:    v_cvt_f32_f16_e32 v49, v49
+; SI-NEXT:    v_add_f32_e32 v51, 0x38000000, v51
 ; SI-NEXT:    v_cvt_f32_f16_e32 v45, v45
 ; SI-NEXT:    v_cvt_f32_f16_e32 v41, v41
 ; SI-NEXT:    v_cvt_f32_f16_e32 v46, v46
+; SI-NEXT:    v_cvt_f16_f32_e32 v51, v51
+; SI-NEXT:    v_add_f32_e32 v47, 0x38000000, v47
 ; SI-NEXT:    v_add_f32_e32 v42, 0x38000000, v42
-; SI-NEXT:    v_add_f32_e32 v51, 0x38000000, v51
 ; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v35
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v37, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v50, 0x38000000, v50
 ; SI-NEXT:    v_add_f32_e32 v30, 0x38000000, v30
+; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
+; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
 ; SI-NEXT:    v_add_f32_e32 v32, 0x38000000, v32
 ; SI-NEXT:    v_add_f32_e32 v29, 0x38000000, v29
+; SI-NEXT:    v_cvt_f16_f32_e32 v47, v47
 ; SI-NEXT:    v_add_f32_e32 v38, 0x38000000, v38
 ; SI-NEXT:    v_cvt_f16_f32_e32 v42, v42
 ; SI-NEXT:    v_add_f32_e32 v54, 0x38000000, v54
-; SI-NEXT:    v_cvt_f16_f32_e32 v51, v51
-; SI-NEXT:    v_add_f32_e32 v49, 0x38000000, v49
 ; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
 ; SI-NEXT:    v_cvt_f16_f32_e32 v50, v50
 ; SI-NEXT:    v_add_f32_e32 v45, 0x38000000, v45
@@ -245438,131 +245443,125 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
 ; SI-NEXT:    v_cvt_f16_f32_e32 v38, v38
 ; SI-NEXT:    v_cvt_f16_f32_e32 v54, v54
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, v49
+; SI-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
 ; SI-NEXT:    v_cvt_f16_f32_e32 v45, v45
 ; SI-NEXT:    v_cvt_f16_f32_e32 v41, v41
 ; SI-NEXT:    v_cvt_f16_f32_e32 v46, v46
+; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v47
 ; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v42
-; SI-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
 ; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v35
 ; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v50
 ; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
 ; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
+; SI-NEXT:    v_or_b32_e32 v38, v38, v47
 ; SI-NEXT:    v_or_b32_e32 v54, v54, v42
-; SI-NEXT:    v_or_b32_e32 v49, v49, v51
 ; SI-NEXT:    v_or_b32_e32 v45, v45, v50
 ; SI-NEXT:    v_or_b32_e32 v41, v41, v30
 ; SI-NEXT:    v_or_b32_e32 v46, v46, v32
+; SI-NEXT:    v_alignbit_b32 v47, v16, v47, 16
 ; SI-NEXT:    v_alignbit_b32 v42, v11, v42, 16
-; SI-NEXT:    v_alignbit_b32 v51, v56, v51, 16
 ; SI-NEXT:    v_alignbit_b32 v50, v14, v50, 16
 ; SI-NEXT:    v_alignbit_b32 v30, v7, v30, 16
 ; SI-NEXT:    v_alignbit_b32 v32, v6, v32, 16
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v4
-; SI-NEXT:    v_or_b32_e32 v4, v37, v34
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v39
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v39, v3
+; SI-NEXT:    v_or_b32_e32 v3, v37, v34
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; SI-NEXT:    v_alignbit_b32 v34, v43, v34, 16
-; SI-NEXT:    v_alignbit_b32 v63, v55, v37, 16
+; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v39
+; SI-NEXT:    v_cvt_f16_f32_e32 v39, v39
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_or_b32_e32 v4, v48, v37
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    v_or_b32_e32 v3, v39, v1
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v1, v55, v1, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v39
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v39
+; SI-NEXT:    v_cvt_f32_f16_e32 v37, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
+; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v4
-; SI-NEXT:    v_or_b32_e32 v4, v39, v5
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v48
+; SI-NEXT:    v_cvt_f32_f16_e32 v39, v3
+; SI-NEXT:    v_or_b32_e32 v3, v37, v5
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v37, v49
+; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v39
 ; SI-NEXT:    v_cvt_f16_f32_e32 v39, v39
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v62
 ; SI-NEXT:    v_alignbit_b32 v5, v52, v5, 16
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v4
-; SI-NEXT:    v_or_b32_e32 v4, v39, v9
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
+; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT:    v_or_b32_e32 v3, v39, v9
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v37
+; SI-NEXT:    v_or_b32_e32 v62, v56, v37
+; SI-NEXT:    v_cvt_f32_f16_e32 v56, v60
 ; SI-NEXT:    v_alignbit_b32 v9, v2, v9, 16
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_or_b32_e32 v62, v48, v13
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v60
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
+; SI-NEXT:    v_add_f32_e32 v56, 0x38000000, v56
+; SI-NEXT:    v_cvt_f16_f32_e32 v56, v56
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v39, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v39
 ; SI-NEXT:    v_cvt_f16_f32_e32 v39, v39
 ; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
-; SI-NEXT:    v_or_b32_e32 v60, v48, v39
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v57
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT:    v_or_b32_e32 v57, v48, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v59
+; SI-NEXT:    v_or_b32_e32 v60, v56, v39
+; SI-NEXT:    v_add_f32_e32 v56, 0x38000000, v57
+; SI-NEXT:    v_cvt_f16_f32_e32 v56, v56
+; SI-NEXT:    v_or_b32_e32 v57, v56, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v56, v59
 ; SI-NEXT:    v_alignbit_b32 v26, v31, v26, 16
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
+; SI-NEXT:    v_add_f32_e32 v56, 0x38000000, v56
+; SI-NEXT:    v_cvt_f16_f32_e32 v56, v56
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v23, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
 ; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
 ; SI-NEXT:    v_or_b32_e32 v29, v29, v23
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v27, v4
-; SI-NEXT:    v_alignbit_b32 v4, v18, v13, 16
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v4, v19, v39, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v27, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
 ; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v4, v24, v23, 16
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; SI-NEXT:    v_or_b32_e32 v59, v48, v27
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v47
-; SI-NEXT:    v_cvt_f32_f16_e32 v47, v58
-; SI-NEXT:    v_or_b32_e32 v38, v38, v48
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v4, v21, v27, 16
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT:    v_add_f32_e32 v47, 0x38000000, v47
-; SI-NEXT:    v_cvt_f16_f32_e32 v47, v47
-; SI-NEXT:    v_or_b32_e32 v58, v47, v35
-; SI-NEXT:    v_alignbit_b32 v47, v16, v48, 16
+; SI-NEXT:    v_or_b32_e32 v59, v56, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v56, v63
+; SI-NEXT:    v_add_f32_e32 v56, 0x38000000, v56
+; SI-NEXT:    v_cvt_f16_f32_e32 v56, v56
+; SI-NEXT:    v_or_b32_e32 v63, v56, v35
 ; SI-NEXT:    v_alignbit_b32 v35, v33, v35, 16
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v49, v3
+; SI-NEXT:    v_add_f32_e32 v49, 0x38000000, v49
+; SI-NEXT:    v_cvt_f16_f32_e32 v49, v49
+; SI-NEXT:    v_or_b32_e32 v3, v49, v51
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v3, v19, v39, 16
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v3, v24, v23, 16
+; SI-NEXT:    v_alignbit_b32 v49, v18, v37, 16
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v3, v21, v27, 16
+; SI-NEXT:    v_alignbit_b32 v51, v61, v51, 16
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; SI-NEXT:  .LBB108_2: ; %end
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v37, 0xffff, v4
+; SI-NEXT:    v_and_b32_e32 v37, 0xffff, v3
 ; SI-NEXT:    v_or_b32_e32 v34, v37, v34
 ; SI-NEXT:    buffer_store_dword v34, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -245571,52 +245570,58 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v34, v34, v37
 ; SI-NEXT:    v_add_i32_e32 v37, vcc, 4, v0
 ; SI-NEXT:    buffer_store_dword v34, v37, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v63
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v34, 0xffff, v4
-; SI-NEXT:    v_or_b32_e32 v34, v34, v37
-; SI-NEXT:    v_add_i32_e32 v37, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v34, v37, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v34, 0xffff, v3
+; SI-NEXT:    v_or_b32_e32 v1, v34, v1
+; SI-NEXT:    v_add_i32_e32 v34, vcc, 8, v0
+; SI-NEXT:    buffer_store_dword v1, v34, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v34, 0xffff, v55
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v40
-; SI-NEXT:    v_or_b32_e32 v34, v34, v37
-; SI-NEXT:    v_add_i32_e32 v37, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v34, v37, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v34, 0xffff, v4
-; SI-NEXT:    v_or_b32_e32 v5, v34, v5
-; SI-NEXT:    v_add_i32_e32 v34, vcc, 16, v0
-; SI-NEXT:    buffer_store_dword v5, v34, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v52
-; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v53
-; SI-NEXT:    v_or_b32_e32 v5, v5, v34
-; SI-NEXT:    v_add_i32_e32 v34, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v5, v34, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v4
-; SI-NEXT:    v_or_b32_e32 v5, v5, v9
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v5, v9, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v55
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v40
+; SI-NEXT:    v_or_b32_e32 v1, v1, v34
+; SI-NEXT:    v_add_i32_e32 v34, vcc, 12, v0
+; SI-NEXT:    buffer_store_dword v1, v34, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v62
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v5
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v18
-; SI-NEXT:    v_or_b32_e32 v1, v2, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v52
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v53
+; SI-NEXT:    v_or_b32_e32 v1, v1, v5
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 20, v0
+; SI-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v5
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 24, v0
+; SI-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v48
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 28, v0
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v62
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v49
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v0
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v13
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 36, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v60
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -245642,7 +245647,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v29
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -245656,7 +245661,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v59
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -245690,24 +245695,28 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v12
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v58
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v49
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v51
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v56
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v61
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v61
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v58
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v63
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v35
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v0
@@ -246062,27 +246071,29 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v49, v5
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v50, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v43, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v38, v9
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v39, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v35, v13
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v27
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v13
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v36, v14
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v15
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v27
 ; SI-NEXT:    v_cvt_f16_f32_e32 v32, v17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v33, v18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v17, v24
@@ -246101,22 +246112,19 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v31, v21
 ; SI-NEXT:    v_cvt_f16_f32_e32 v27, v46
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v47
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v57
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v56
 ; SI-NEXT:    v_cvt_f16_f32_e32 v26, v58
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v59
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v61
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, v60
+; SI-NEXT:    v_cvt_f16_f32_e32 v43, v61
 ; SI-NEXT:    v_cvt_f16_f32_e32 v24, v62
-; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v63
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v34
 ; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v34
 ; SI-NEXT:    v_cvt_f16_f32_e32 v34, v37
 ; SI-NEXT:    s_waitcnt vmcnt(13)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v21, v48
@@ -246296,9 +246304,9 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_add_f32_e32 v51, 0x38000000, v51
 ; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
 ; SI-NEXT:    v_cvt_f16_f32_e32 v51, v51
+; SI-NEXT:    v_cvt_f32_f16_e32 v40, v45
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v41, v41
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
 ; SI-NEXT:    v_or_b32_e32 v60, v48, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v48, v62
 ; SI-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
@@ -246306,153 +246314,153 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f32_f16_e32 v54, v56
 ; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v48
 ; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT:    v_add_f32_e32 v41, 0x38000000, v41
+; SI-NEXT:    v_add_f32_e32 v40, 0x38000000, v40
 ; SI-NEXT:    v_add_f32_e32 v54, 0x38000000, v54
 ; SI-NEXT:    v_cvt_f16_f32_e32 v54, v54
 ; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
+; SI-NEXT:    v_add_f32_e32 v41, 0x38000000, v41
+; SI-NEXT:    v_cvt_f16_f32_e32 v40, v40
 ; SI-NEXT:    v_or_b32_e32 v56, v54, v48
 ; SI-NEXT:    v_lshlrev_b32_e32 v54, 16, v55
 ; SI-NEXT:    v_cvt_f32_f16_e32 v55, v44
+; SI-NEXT:    v_cvt_f16_f32_e32 v41, v41
+; SI-NEXT:    v_or_b32_e32 v45, v40, v54
+; SI-NEXT:    v_cvt_f32_f16_e32 v40, v7
+; SI-NEXT:    v_add_f32_e32 v55, 0x38000000, v55
+; SI-NEXT:    v_cvt_f16_f32_e32 v55, v55
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT:    v_add_f32_e32 v40, 0x38000000, v40
+; SI-NEXT:    v_cvt_f16_f32_e32 v40, v40
+; SI-NEXT:    v_lshlrev_b32_e32 v55, 16, v55
+; SI-NEXT:    v_or_b32_e32 v7, v41, v55
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v40
 ; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
 ; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
-; SI-NEXT:    v_add_f32_e32 v55, 0x38000000, v55
-; SI-NEXT:    v_cvt_f16_f32_e32 v55, v55
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
 ; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v55, 16, v55
-; SI-NEXT:    v_or_b32_e32 v41, v41, v55
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
 ; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
 ; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
 ; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
 ; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
 ; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v37, v37
 ; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
 ; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v37
-; SI-NEXT:    v_alignbit_b32 v63, v46, v51, 16
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v45
-; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
 ; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; SI-NEXT:    v_add_f32_e32 v40, 0x38000000, v40
-; SI-NEXT:    v_cvt_f16_f32_e32 v40, v40
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v37
 ; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
 ; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT:    v_or_b32_e32 v45, v40, v54
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v43
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
 ; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
 ; SI-NEXT:    v_cvt_f32_f16_e32 v34, v34
-; SI-NEXT:    v_add_f32_e32 v40, 0x38000000, v40
+; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
 ; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
 ; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
+; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
 ; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v40, v40
-; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
 ; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
 ; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
 ; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
 ; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
 ; SI-NEXT:    v_add_f32_e32 v34, 0x38000000, v34
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
 ; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v34, v34
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v40
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v37
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; SI-NEXT:    v_or_b32_e32 v7, v7, v40
 ; SI-NEXT:    v_or_b32_e32 v14, v14, v13
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v17
 ; SI-NEXT:    v_or_b32_e32 v34, v34, v21
 ; SI-NEXT:    v_alignbit_b32 v4, v57, v4, 16
+; SI-NEXT:    v_alignbit_b32 v63, v46, v51, 16
 ; SI-NEXT:    v_alignbit_b32 v62, v29, v48, 16
 ; SI-NEXT:    v_alignbit_b32 v61, v52, v54, 16
 ; SI-NEXT:    v_alignbit_b32 v44, v49, v55, 16
-; SI-NEXT:    v_alignbit_b32 v43, v38, v40, 16
 ; SI-NEXT:    v_alignbit_b32 v13, v32, v13, 16
 ; SI-NEXT:    v_alignbit_b32 v17, v18, v17, 16
 ; SI-NEXT:    v_alignbit_b32 v21, v2, v21, 16
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v41, v41
+; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NEXT:    v_add_f32_e32 v41, 0x38000000, v41
 ; SI-NEXT:    v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT:    v_or_b32_e32 v41, v41, v10
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    v_or_b32_e32 v7, v7, v40
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v7, v41, v10
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; SI-NEXT:    v_alignbit_b32 v10, v35, v10, 16
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v42
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v41
+; SI-NEXT:    v_cvt_f32_f16_e32 v41, v7
 ; SI-NEXT:    v_add_f32_e32 v41, 0x38000000, v41
 ; SI-NEXT:    v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT:    v_or_b32_e32 v41, v41, v20
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    v_or_b32_e32 v7, v41, v20
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; SI-NEXT:    v_alignbit_b32 v20, v31, v20, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v41
+; SI-NEXT:    v_cvt_f32_f16_e32 v41, v7
 ; SI-NEXT:    v_add_f32_e32 v41, 0x38000000, v41
 ; SI-NEXT:    v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT:    v_or_b32_e32 v41, v41, v28
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    v_or_b32_e32 v7, v41, v28
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    v_alignbit_b32 v28, v15, v28, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v41
+; SI-NEXT:    v_cvt_f32_f16_e32 v41, v7
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v41, 0x38000000, v41
 ; SI-NEXT:    v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT:    v_or_b32_e32 v41, v41, v27
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v42, v7
+; SI-NEXT:    v_or_b32_e32 v7, v41, v27
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v27, v11, v27, 16
 ; SI-NEXT:    v_add_f32_e32 v41, 0x38000000, v42
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT:    v_alignbit_b32 v27, v11, v27, 16
-; SI-NEXT:    v_or_b32_e32 v41, v41, v26
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v26, v8, v26, 16
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v42
-; SI-NEXT:    v_add_f32_e32 v42, 0x38000000, v42
-; SI-NEXT:    v_cvt_f16_f32_e32 v42, v42
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_or_b32_e32 v41, v42, v24
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v42, v43
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_or_b32_e32 v7, v41, v26
 ; SI-NEXT:    v_cvt_f32_f16_e32 v41, v1
-; SI-NEXT:    v_alignbit_b32 v24, v5, v24, 16
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v42, 0x38000000, v42
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, v42
 ; SI-NEXT:    v_add_f32_e32 v41, 0x38000000, v41
 ; SI-NEXT:    v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT:    v_or_b32_e32 v1, v41, v37
-; SI-NEXT:    v_mov_b32_e32 v51, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    v_or_b32_e32 v43, v42, v24
+; SI-NEXT:    v_alignbit_b32 v26, v8, v26, 16
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_or_b32_e32 v7, v41, v37
+; SI-NEXT:    v_mov_b32_e32 v51, v7
+; SI-NEXT:    v_alignbit_b32 v7, v38, v40, 16
+; SI-NEXT:    v_alignbit_b32 v24, v5, v24, 16
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_alignbit_b32 v37, v1, v37, 16
 ; SI-NEXT:  .LBB109_3: ; %end
 ; SI-NEXT:    v_and_b32_e32 v48, 0xffff, v60
@@ -246504,6 +246512,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v44
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v3
@@ -246521,8 +246530,9 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_add_i32_e32 v29, vcc, 36, v0
 ; SI-NEXT:    buffer_store_dword v4, v29, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v43
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v7
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 40, v0
 ; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
@@ -246533,7 +246543,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 44, v0
 ; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v10
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
@@ -246559,7 +246569,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v20
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
@@ -246585,7 +246595,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x4c, v0
 ; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v28
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
@@ -246599,7 +246609,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x54, v0
 ; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v27
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
@@ -246613,7 +246623,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x5c, v0
 ; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v26
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
@@ -246627,10 +246637,8 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x64, v0
 ; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v43
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v24
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v7
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x68, v0
 ; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
@@ -248333,9 +248341,10 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:80
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:12
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:8
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:12
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:76
@@ -248355,13 +248364,13 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16
 ; SI-NEXT:    v_mov_b32_e32 v42, v4
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
@@ -248372,14 +248381,15 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v41
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
@@ -248388,35 +248398,35 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v41, s16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v44, s26
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, s27
+; SI-NEXT:    v_cvt_f32_f16_e32 v43, s29
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v2
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v41, s17
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, s29
+; SI-NEXT:    s_mov_b64 s[4:5], 0
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v3
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v41, s18
-; SI-NEXT:    s_mov_b64 s[4:5], 0
+; SI-NEXT:    v_cvt_f32_f16_e32 v59, s27
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v42
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v41, s19
-; SI-NEXT:    v_mov_b32_e32 v2, v5
+; SI-NEXT:    v_mov_b32_e32 v2, v9
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v41, s20
-; SI-NEXT:    v_mov_b32_e32 v3, v7
+; SI-NEXT:    v_mov_b32_e32 v3, v10
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v6
@@ -248451,56 +248461,56 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v41, s25
-; SI-NEXT:    v_mov_b32_e32 v62, v28
+; SI-NEXT:    v_mov_b32_e32 v60, v29
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v11
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v41, s28
-; SI-NEXT:    v_mov_b32_e32 v63, v29
+; SI-NEXT:    v_mov_b32_e32 v61, v30
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v12
-; SI-NEXT:    v_mov_b32_e32 v60, v30
-; SI-NEXT:    v_mov_b32_e32 v61, v32
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, v32
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v63
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v62
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v13
 ; SI-NEXT:    v_cvt_f32_f16_e32 v13, v29
-; SI-NEXT:    v_mov_b32_e32 v32, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v46
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v40
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v55
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v14
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v31
-; SI-NEXT:    v_mov_b32_e32 v31, v46
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v59
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v54
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v53
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v15
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, v27
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v40
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v55
+; SI-NEXT:    v_cvt_f32_f16_e32 v58, v52
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v51
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v16, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v54
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v53
+; SI-NEXT:    v_cvt_f32_f16_e32 v56, v50
+; SI-NEXT:    v_cvt_f32_f16_e32 v46, v48
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v17
 ; SI-NEXT:    v_cvt_f32_f16_e32 v17, v25
 ; SI-NEXT:    v_mov_b32_e32 v25, v44
-; SI-NEXT:    v_cvt_f32_f16_e32 v58, v52
+; SI-NEXT:    v_cvt_f32_f16_e32 v57, v39
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v18
 ; SI-NEXT:    v_cvt_f32_f16_e32 v18, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v51
-; SI-NEXT:    v_cvt_f32_f16_e32 v56, v50
+; SI-NEXT:    v_cvt_f32_f16_e32 v44, v38
+; SI-NEXT:    v_cvt_f32_f16_e32 v47, v37
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v19
@@ -248511,16 +248521,11 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v20
 ; SI-NEXT:    v_cvt_f32_f16_e32 v20, v26
-; SI-NEXT:    v_mov_b32_e32 v26, v45
-; SI-NEXT:    v_cvt_f32_f16_e32 v46, v48
+; SI-NEXT:    v_cvt_f32_f16_e32 v42, v36
+; SI-NEXT:    v_cvt_f32_f16_e32 v45, v35
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v49
-; SI-NEXT:    v_cvt_f32_f16_e32 v57, v39
-; SI-NEXT:    v_cvt_f32_f16_e32 v44, v38
-; SI-NEXT:    v_cvt_f32_f16_e32 v47, v37
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v36
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, v35
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    v_cvt_f32_f16_e32 v43, v33
 ; SI-NEXT:    s_branch .LBB111_3
@@ -248529,53 +248534,33 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; SI-NEXT:    ; kill: killed $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr41
 ; SI-NEXT:    ; kill: killed $vgpr41
-; SI-NEXT:    v_mov_b32_e32 v61, v32
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; kill: killed $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr41
-; SI-NEXT:    ; kill: killed $vgpr41
-; SI-NEXT:    v_mov_b32_e32 v32, v31
+; SI-NEXT:    v_mov_b32_e32 v61, v30
 ; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; kill: killed $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr41
 ; SI-NEXT:    ; kill: killed $vgpr41
-; SI-NEXT:    v_mov_b32_e32 v31, v46
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; kill: killed $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr41
-; SI-NEXT:    ; kill: killed $vgpr41
-; SI-NEXT:    v_mov_b32_e32 v60, v30
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; kill: killed $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr41
-; SI-NEXT:    ; kill: killed $vgpr41
-; SI-NEXT:    v_mov_b32_e32 v63, v29
+; SI-NEXT:    v_mov_b32_e32 v60, v29
 ; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; kill: killed $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr41
 ; SI-NEXT:    ; kill: killed $vgpr41
-; SI-NEXT:    v_mov_b32_e32 v62, v28
+; SI-NEXT:    v_mov_b32_e32 v3, v10
 ; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; kill: killed $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr41
 ; SI-NEXT:    ; kill: killed $vgpr41
-; SI-NEXT:    v_mov_b32_e32 v3, v7
+; SI-NEXT:    v_mov_b32_e32 v2, v9
 ; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; kill: killed $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr41
 ; SI-NEXT:    ; kill: killed $vgpr41
-; SI-NEXT:    v_mov_b32_e32 v2, v5
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; kill: killed $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr41
 ; SI-NEXT:    s_mov_b64 s[4:5], -1
 ; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; kill: killed $vgpr4
-; SI-NEXT:    ; kill: killed $vgpr41
 ; SI-NEXT:    ; implicit-def: $vgpr41
 ; SI-NEXT:    ; kill: killed $vgpr41
 ; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr26
+; SI-NEXT:    ; implicit-def: $vgpr59
 ; SI-NEXT:    ; implicit-def: $vgpr23
 ; SI-NEXT:    ; implicit-def: $vgpr24
 ; SI-NEXT:    ; implicit-def: $vgpr1
@@ -248612,6 +248597,22 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; SI-NEXT:    ; kill: killed $vgpr41
 ; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; kill: killed $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; kill: killed $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; kill: killed $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; kill: killed $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; kill: killed $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; kill: killed $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; kill: killed $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; kill: killed $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; kill: killed $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; kill: killed $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr4
@@ -248648,10 +248649,10 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v19, v1
 ; SI-NEXT:    s_cbranch_vccnz .LBB111_5
 ; SI-NEXT:  ; %bb.4: ; %cmp.true
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 3, v3
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v2
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v10, vcc, 3, v3
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 3, v2
 ; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v41, s16
 ; SI-NEXT:    s_add_i32 s17, s17, 3
@@ -248706,17 +248707,16 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; SI-NEXT:    v_add_i32_e32 v54, vcc, 3, v54
 ; SI-NEXT:    v_add_i32_e32 v55, vcc, 3, v55
 ; SI-NEXT:    v_add_i32_e32 v40, vcc, 3, v40
-; SI-NEXT:    v_add_i32_e32 v58, vcc, 3, v59
-; SI-NEXT:    v_add_i32_e32 v59, vcc, 3, v31
-; SI-NEXT:    v_add_i32_e32 v31, vcc, 3, v32
-; SI-NEXT:    v_add_i32_e32 v32, vcc, 3, v61
-; SI-NEXT:    v_add_i32_e32 v30, vcc, 3, v60
-; SI-NEXT:    v_add_i32_e32 v29, vcc, 3, v63
-; SI-NEXT:    v_add_i32_e32 v28, vcc, 3, v62
-; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_add_i32_e32 v21, vcc, 3, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v58, vcc, 3, v62
+; SI-NEXT:    v_add_i32_e32 v59, vcc, 3, v63
+; SI-NEXT:    v_add_i32_e32 v31, vcc, 3, v31
+; SI-NEXT:    v_add_i32_e32 v32, vcc, 3, v32
+; SI-NEXT:    v_add_i32_e32 v30, vcc, 3, v61
+; SI-NEXT:    v_add_i32_e32 v29, vcc, 3, v60
 ; SI-NEXT:    s_add_i32 s29, s29, 3
+; SI-NEXT:    s_waitcnt vmcnt(10)
+; SI-NEXT:    v_add_i32_e32 v20, vcc, 3, v4
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
 ; SI-NEXT:    s_add_i32 s28, s28, 3
 ; SI-NEXT:    s_add_i32 s27, s27, 3
 ; SI-NEXT:    s_add_i32 s26, s26, 3
@@ -248732,72 +248732,75 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; SI-NEXT:    v_cvt_f32_f16_e32 v45, v35
 ; SI-NEXT:    v_cvt_f32_f16_e32 v43, v33
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 3, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v19, vcc, 3, v4
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v28, vcc, 3, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v18, vcc, 3, v4
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_add_i32_e32 v27, vcc, 3, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, 3, v4
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_add_i32_e32 v26, vcc, 3, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 3, v4
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_add_i32_e32 v25, vcc, 3, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 3, v4
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v27, vcc, 3, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_add_i32_e32 v24, vcc, 3, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v14, vcc, 3, v4
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_add_i32_e32 v26, vcc, 3, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v23, vcc, 3, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v13, vcc, 3, v4
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_add_i32_e32 v25, vcc, 3, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v22, vcc, 3, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v12, vcc, 3, v4
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_add_i32_e32 v24, vcc, 3, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v21, vcc, 3, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v11, vcc, 3, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_add_i32_e32 v23, vcc, 3, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v8, vcc, 3, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 3, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v7, vcc, 3, v4
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_add_i32_e32 v22, vcc, 3, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 3, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 3, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 3, v3
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 3, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_add_i32_e32 v6, vcc, 3, v4
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v4
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
@@ -248807,6 +248810,8 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v3
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v4
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v49
@@ -248822,6 +248827,7 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v59
+; SI-NEXT:    v_cvt_f32_f16_e32 v59, s27
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v8
@@ -248879,7 +248885,6 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v20
 ; SI-NEXT:    v_cvt_f32_f16_e32 v20, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, s27
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v34
@@ -248940,7 +248945,7 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v26
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v59
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v25
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
index f219402aa5ebf..1056c66720b25 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
@@ -3743,7 +3743,7 @@ define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 in
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s16, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s18, 0
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB25_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s8, s3, 24
@@ -3754,11 +3754,11 @@ define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 in
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s13, s1, 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s14, s1, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s15, s1, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s17, s0, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s18, s0, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s16, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s17, s0, 8
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[4:5], s[2:3], 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[6:7], s[0:1], 24
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s18
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB25_3
 ; GFX11-TRUE16-NEXT:  .LBB25_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_i32 s1, s1, 3
@@ -3775,12 +3775,12 @@ define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 in
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s13, s1, 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s14, s1, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s15, s1, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s17, s0, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s18, s0, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s16, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s17, s0, 8
 ; GFX11-TRUE16-NEXT:  .LBB25_3: ; %end
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s18
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, s17
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s17
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, s16
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, s6
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, s1
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, s15
@@ -3796,8 +3796,8 @@ define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 in
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, s8
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB25_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr18_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr16_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr6_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr15_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr14_lo16
@@ -3814,7 +3814,7 @@ define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 in
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX11-FAKE16-NEXT:    s_mov_b32 s17, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s18, 0
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB25_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s8, s3, 24
@@ -3826,10 +3826,10 @@ define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 in
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s14, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s15, s1, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s16, s0, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s18, s0, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s17, s0, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[4:5], s[2:3], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[6:7], s[0:1], 24
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s17
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s18
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB25_3
 ; GFX11-FAKE16-NEXT:  .LBB25_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    s_add_i32 s1, s1, 3
@@ -3847,10 +3847,10 @@ define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 in
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s14, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s15, s1, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s16, s0, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s18, s0, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s17, s0, 8
 ; GFX11-FAKE16-NEXT:  .LBB25_3: ; %end
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s18
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s17
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s6
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s15
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s13
@@ -3860,7 +3860,7 @@ define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 in
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s9 :: v_dual_mov_b32 v15, s8
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-FAKE16-NEXT:  .LBB25_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr17
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr16
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr6
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr15
@@ -12472,7 +12472,7 @@ define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 in
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s16, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s18, 0
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB69_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s8, s3, 24
@@ -12483,11 +12483,11 @@ define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 in
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s13, s1, 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s14, s1, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s15, s1, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s17, s0, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s18, s0, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s16, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s17, s0, 8
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[4:5], s[2:3], 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[6:7], s[0:1], 24
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s18
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB69_3
 ; GFX11-TRUE16-NEXT:  .LBB69_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_u32 s0, s0, 3
@@ -12504,12 +12504,12 @@ define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 in
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s13, s1, 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s14, s1, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s15, s1, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s17, s0, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s18, s0, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s16, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s17, s0, 8
 ; GFX11-TRUE16-NEXT:  .LBB69_3: ; %end
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s18
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, s17
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s17
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, s16
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, s6
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, s1
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, s15
@@ -12525,8 +12525,8 @@ define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 in
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, s8
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB69_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr18_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr16_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr6_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr15_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr14_lo16
@@ -12543,7 +12543,7 @@ define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 in
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX11-FAKE16-NEXT:    s_mov_b32 s17, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s18, 0
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB69_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s8, s3, 24
@@ -12555,10 +12555,10 @@ define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 in
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s14, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s15, s1, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s16, s0, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s18, s0, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s17, s0, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[4:5], s[2:3], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[6:7], s[0:1], 24
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s17
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s18
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB69_3
 ; GFX11-FAKE16-NEXT:  .LBB69_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    s_add_u32 s0, s0, 3
@@ -12576,10 +12576,10 @@ define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 in
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s14, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s15, s1, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s16, s0, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s18, s0, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s17, s0, 8
 ; GFX11-FAKE16-NEXT:  .LBB69_3: ; %end
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s18
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s17
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s6
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s15
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s13
@@ -12589,7 +12589,7 @@ define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 in
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s9 :: v_dual_mov_b32 v15, s8
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-FAKE16-NEXT:  .LBB69_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr17
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr16
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr6
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr15
@@ -19198,74 +19198,78 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) {
 ; VI-LABEL: bitcast_v8i16_to_v16i8:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v17, v3
-; VI-NEXT:    v_mov_b32_e32 v16, v2
-; VI-NEXT:    v_mov_b32_e32 v19, v1
-; VI-NEXT:    v_mov_b32_e32 v18, v0
+; VI-NEXT:    v_mov_b32_e32 v19, v3
+; VI-NEXT:    v_mov_b32_e32 v18, v2
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
-; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
-; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
-; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
-; VI-NEXT:    ; implicit-def: $vgpr1
+; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v19
+; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v18
+; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; VI-NEXT:    ; implicit-def: $vgpr16
+; VI-NEXT:    ; implicit-def: $vgpr20
 ; VI-NEXT:    ; implicit-def: $vgpr3
+; VI-NEXT:    ; implicit-def: $vgpr17
 ; VI-NEXT:    ; implicit-def: $vgpr5
 ; VI-NEXT:    ; implicit-def: $vgpr7
+; VI-NEXT:    ; implicit-def: $vgpr8
 ; VI-NEXT:    ; implicit-def: $vgpr9
-; VI-NEXT:    ; implicit-def: $vgpr11
+; VI-NEXT:    ; implicit-def: $vgpr21
 ; VI-NEXT:    ; implicit-def: $vgpr13
 ; VI-NEXT:    ; implicit-def: $vgpr15
+; VI-NEXT:    ; implicit-def: $vgpr11
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB96_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
-; VI-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
-; VI-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
-; VI-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
-; VI-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
-; VI-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
-; VI-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
-; VI-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
+; VI-NEXT:    v_lshrrev_b32_e32 v15, 24, v19
+; VI-NEXT:    v_lshrrev_b32_e32 v13, 8, v19
+; VI-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
+; VI-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v20, 8, v0
+; VI-NEXT:    v_lshrrev_b64 v[11:12], 24, v[18:19]
+; VI-NEXT:    v_lshrrev_b64 v[3:4], 24, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v16, v0
+; VI-NEXT:    v_mov_b32_e32 v17, v1
+; VI-NEXT:    v_mov_b32_e32 v8, v18
+; VI-NEXT:    v_mov_b32_e32 v21, v19
+; VI-NEXT:    ; implicit-def: $vgpr1
+; VI-NEXT:    ; implicit-def: $vgpr19
 ; VI-NEXT:  .LBB96_2: ; %Flow
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB96_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
 ; VI-NEXT:    v_mov_b32_e32 v3, 3
-; VI-NEXT:    v_add_u16_sdwa v6, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v20, 3, v19
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
-; VI-NEXT:    v_add_u16_sdwa v2, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_sdwa v14, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_sdwa v10, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v1, v20, v0
-; VI-NEXT:    v_add_u16_e32 v19, 3, v18
+; VI-NEXT:    v_add_u16_sdwa v6, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v2, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v14, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v10, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_e32 v17, 3, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
+; VI-NEXT:    v_add_u16_e32 v16, 3, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; VI-NEXT:    v_add_u16_e32 v21, 3, v17
+; VI-NEXT:    v_add_u16_e32 v21, 3, v19
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v14
-; VI-NEXT:    v_add_u16_e32 v17, 3, v16
+; VI-NEXT:    v_add_u16_e32 v8, 3, v18
 ; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v10
-; VI-NEXT:    v_or_b32_e32 v0, v19, v0
-; VI-NEXT:    v_or_b32_e32 v8, v21, v4
-; VI-NEXT:    v_or_b32_e32 v7, v17, v3
-; VI-NEXT:    v_lshrrev_b64 v[11:12], 24, v[7:8]
+; VI-NEXT:    v_or_b32_e32 v1, v17, v1
+; VI-NEXT:    v_or_b32_e32 v0, v16, v0
+; VI-NEXT:    v_or_b32_e32 v19, v21, v4
+; VI-NEXT:    v_or_b32_e32 v18, v8, v3
+; VI-NEXT:    v_lshrrev_b64 v[11:12], 24, v[18:19]
 ; VI-NEXT:    v_lshrrev_b64 v[3:4], 24, v[0:1]
-; VI-NEXT:    v_lshrrev_b32_e32 v13, 8, v8
-; VI-NEXT:    v_lshrrev_b32_e32 v9, 8, v7
+; VI-NEXT:    v_lshrrev_b32_e32 v13, 8, v19
+; VI-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v20, 8, v0
 ; VI-NEXT:    v_bfe_u32 v15, v14, 8, 8
 ; VI-NEXT:    v_bfe_u32 v7, v6, 8, 8
-; VI-NEXT:    v_mov_b32_e32 v18, v19
-; VI-NEXT:    v_mov_b32_e32 v19, v20
-; VI-NEXT:    v_mov_b32_e32 v16, v17
-; VI-NEXT:    v_mov_b32_e32 v17, v21
 ; VI-NEXT:  .LBB96_4: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    v_mov_b32_e32 v0, v18
-; VI-NEXT:    v_mov_b32_e32 v4, v19
-; VI-NEXT:    v_mov_b32_e32 v8, v16
-; VI-NEXT:    v_mov_b32_e32 v12, v17
+; VI-NEXT:    v_mov_b32_e32 v0, v16
+; VI-NEXT:    v_mov_b32_e32 v1, v20
+; VI-NEXT:    v_mov_b32_e32 v4, v17
+; VI-NEXT:    v_mov_b32_e32 v12, v21
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: bitcast_v8i16_to_v16i8:
@@ -24319,13 +24323,13 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.h, v11.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v3.h
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[2:3]
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v11.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v10.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v3.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v3.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.h, v11.h
 ; GFX11-TRUE16-NEXT:  .LBB108_2: ; %Flow
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB108_4
@@ -24409,7 +24413,6 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
 ; GFX11-TRUE16-NEXT:  .LBB108_4: ; %end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v17.l
@@ -25857,9 +25860,9 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT:    v_readfirstlane_b32 s8, v1
+; SI-NEXT:    v_readfirstlane_b32 s9, v1
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_readfirstlane_b32 s9, v0
+; SI-NEXT:    v_readfirstlane_b32 s11, v0
 ; SI-NEXT:    s_cbranch_scc0 .LBB111_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xff
@@ -25873,11 +25876,11 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3
 ; SI-NEXT:    s_and_b32 s4, s20, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s21, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_lshl_b32 s10, s4, 16
+; SI-NEXT:    s_lshl_b32 s8, s4, 16
 ; SI-NEXT:    s_and_b32 s4, s22, 0xff
 ; SI-NEXT:    s_lshl_b32 s4, s4, 16
 ; SI-NEXT:    s_lshl_b32 s5, s23, 24
-; SI-NEXT:    s_or_b32 s11, s5, s4
+; SI-NEXT:    s_or_b32 s10, s5, s4
 ; SI-NEXT:    s_and_b32 s4, s24, 0xff
 ; SI-NEXT:    s_lshl_b32 s4, s4, 16
 ; SI-NEXT:    s_lshl_b32 s5, s25, 24
@@ -25890,20 +25893,20 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3
 ; SI-NEXT:    s_lshl_b32 s5, s29, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    s_lshl_b32 s14, s4, 16
-; SI-NEXT:    s_and_b32 s4, s9, 0xff
+; SI-NEXT:    s_and_b32 s4, s11, 0xff
 ; SI-NEXT:    s_lshl_b32 s4, s4, 16
-; SI-NEXT:    s_lshl_b32 s5, s8, 24
+; SI-NEXT:    s_lshl_b32 s5, s9, 24
 ; SI-NEXT:    s_or_b32 s15, s5, s4
 ; SI-NEXT:    s_cbranch_execnz .LBB111_3
 ; SI-NEXT:  .LBB111_2: ; %cmp.true
 ; SI-NEXT:    s_add_i32 s28, s28, 3
 ; SI-NEXT:    s_and_b32 s4, s28, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s29, 8
-; SI-NEXT:    s_add_i32 s9, s9, 3
+; SI-NEXT:    s_add_i32 s11, s11, 3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s6, s9, 0xff
+; SI-NEXT:    s_and_b32 s6, s11, 0xff
 ; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s8, 24
+; SI-NEXT:    s_lshl_b32 s5, s9, 24
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s5, s6
@@ -25950,8 +25953,8 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3
 ; SI-NEXT:    s_add_i32 s6, s6, 0x3000000
 ; SI-NEXT:    s_and_b32 s7, s6, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    s_and_b32 s11, s8, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s10, s8, 16
+; SI-NEXT:    s_and_b32 s10, s8, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s8, s8, 16
 ; SI-NEXT:    s_and_b32 s13, s5, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s12, s5, 16
 ; SI-NEXT:    s_and_b32 s15, s4, 0xffff0000
@@ -25959,8 +25962,8 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3
 ; SI-NEXT:  .LBB111_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s6
 ; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s10
-; SI-NEXT:    v_mov_b32_e32 v3, s11
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v3, s10
 ; SI-NEXT:    v_mov_b32_e32 v4, s12
 ; SI-NEXT:    v_mov_b32_e32 v5, s13
 ; SI-NEXT:    v_mov_b32_e32 v6, s14
@@ -25969,8 +25972,8 @@ define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i3
 ; SI-NEXT:  .LBB111_4:
 ; SI-NEXT:    ; implicit-def: $sgpr6
 ; SI-NEXT:    ; implicit-def: $sgpr7
+; SI-NEXT:    ; implicit-def: $sgpr8
 ; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $sgpr11
 ; SI-NEXT:    ; implicit-def: $sgpr12
 ; SI-NEXT:    ; implicit-def: $sgpr13
 ; SI-NEXT:    ; implicit-def: $sgpr14

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
index 12c98939a2368..8dc00701dcfd6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
@@ -5622,7 +5622,7 @@ define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 in
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s43, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s46, 0
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB25_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s12, s19, 24
@@ -5642,14 +5642,14 @@ define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 in
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s40, s2, 8
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s41, s1, 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s42, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s1, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s0, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s46, s0, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s43, s1, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s0, 8
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[4:5], s[18:19], 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[6:7], s[16:17], 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[8:9], s[2:3], 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[10:11], s[0:1], 24
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s43
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s46
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB25_3
 ; GFX11-TRUE16-NEXT:  .LBB25_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_i32 s1, s1, 3
@@ -5681,16 +5681,16 @@ define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 in
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s40, s2, 8
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s41, s1, 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s42, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s1, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s0, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s46, s0, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s43, s1, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s0, 8
 ; GFX11-TRUE16-NEXT:  .LBB25_3: ; %end
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s46
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, s45
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s45
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, s44
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, s10
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, s44
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, s43
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, s42
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, s41
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, s2
@@ -5719,10 +5719,10 @@ define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 in
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.l, s12
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB25_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr46_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr45_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr44_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr43_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr42_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr41_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr40_lo16
@@ -5749,7 +5749,7 @@ define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 in
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX11-FAKE16-NEXT:    s_mov_b32 s44, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s46, 0
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB25_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s12, s19, 24
@@ -5770,13 +5770,13 @@ define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 in
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s1, 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s1, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s0, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s0, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s44, s0, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s0, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[4:5], s[18:19], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[6:7], s[16:17], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[8:9], s[2:3], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[10:11], s[0:1], 24
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s44
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s46
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB25_3
 ; GFX11-FAKE16-NEXT:  .LBB25_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    s_add_i32 s1, s1, 3
@@ -5809,12 +5809,12 @@ define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 in
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s1, 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s1, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s0, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s0, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s44, s0, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s0, 8
 ; GFX11-FAKE16-NEXT:  .LBB25_3: ; %end
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s46
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s45 :: v_dual_mov_b32 v3, s10
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s45
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s44 :: v_dual_mov_b32 v3, s10
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s43
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s41
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s40
@@ -5831,8 +5831,8 @@ define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 in
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-FAKE16-NEXT:  .LBB25_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr45
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr44
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr43
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
@@ -19875,7 +19875,7 @@ define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 in
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s43, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s46, 0
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB69_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s12, s19, 24
@@ -19895,14 +19895,14 @@ define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 in
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s40, s2, 8
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s41, s1, 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s42, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s1, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s0, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s46, s0, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s43, s1, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s0, 8
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[4:5], s[18:19], 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[6:7], s[16:17], 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[8:9], s[2:3], 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[10:11], s[0:1], 24
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s43
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s46
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB69_3
 ; GFX11-TRUE16-NEXT:  .LBB69_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_u32 s0, s0, 3
@@ -19934,16 +19934,16 @@ define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 in
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s40, s2, 8
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s41, s1, 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s42, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s1, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s0, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s46, s0, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s43, s1, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s0, 8
 ; GFX11-TRUE16-NEXT:  .LBB69_3: ; %end
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s46
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, s45
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s45
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, s44
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, s10
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, s44
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, s43
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, s42
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, s41
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, s2
@@ -19972,10 +19972,10 @@ define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 in
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.l, s12
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB69_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr46_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr45_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr44_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr10_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr43_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr42_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr41_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr40_lo16
@@ -20002,7 +20002,7 @@ define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 in
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX11-FAKE16-NEXT:    s_mov_b32 s44, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s46, 0
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB69_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s12, s19, 24
@@ -20023,13 +20023,13 @@ define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 in
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s1, 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s1, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s0, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s0, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s44, s0, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s0, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[4:5], s[18:19], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[6:7], s[16:17], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[8:9], s[2:3], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[10:11], s[0:1], 24
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s44
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s46
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB69_3
 ; GFX11-FAKE16-NEXT:  .LBB69_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    s_add_u32 s0, s0, 3
@@ -20062,12 +20062,12 @@ define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 in
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s1, 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s1, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s0, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s0, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s44, s0, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s0, 8
 ; GFX11-FAKE16-NEXT:  .LBB69_3: ; %end
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s46
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s45 :: v_dual_mov_b32 v3, s10
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s45
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s44 :: v_dual_mov_b32 v3, s10
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s43
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s41
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s40
@@ -20084,8 +20084,8 @@ define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 in
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-FAKE16-NEXT:  .LBB69_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr46
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr45
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr44
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr10
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr43
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
@@ -31362,32 +31362,36 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) {
 ; VI-LABEL: bitcast_v16i16_to_v32i8:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v33, v5
-; VI-NEXT:    v_mov_b32_e32 v32, v4
-; VI-NEXT:    v_mov_b32_e32 v35, v3
-; VI-NEXT:    v_mov_b32_e32 v34, v2
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
 ; VI-NEXT:    v_lshrrev_b32_e32 v30, 16, v7
 ; VI-NEXT:    v_lshrrev_b32_e32 v26, 16, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v22, 16, v33
-; VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v32
-; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v35
-; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v34
+; VI-NEXT:    v_lshrrev_b32_e32 v22, 16, v5
+; VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v36, 16, v1
-; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v0
+; VI-NEXT:    ; implicit-def: $vgpr50
 ; VI-NEXT:    ; implicit-def: $vgpr38
-; VI-NEXT:    ; implicit-def: $vgpr5
+; VI-NEXT:    ; implicit-def: $vgpr48
 ; VI-NEXT:    ; implicit-def: $vgpr37
+; VI-NEXT:    ; implicit-def: $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr8
 ; VI-NEXT:    ; implicit-def: $vgpr9
+; VI-NEXT:    ; implicit-def: $vgpr35
 ; VI-NEXT:    ; implicit-def: $vgpr13
 ; VI-NEXT:    ; implicit-def: $vgpr15
+; VI-NEXT:    ; implicit-def: $vgpr16
 ; VI-NEXT:    ; implicit-def: $vgpr17
+; VI-NEXT:    ; implicit-def: $vgpr49
 ; VI-NEXT:    ; implicit-def: $vgpr21
 ; VI-NEXT:    ; implicit-def: $vgpr23
+; VI-NEXT:    ; implicit-def: $vgpr24
 ; VI-NEXT:    ; implicit-def: $vgpr25
+; VI-NEXT:    ; implicit-def: $vgpr51
 ; VI-NEXT:    ; implicit-def: $vgpr29
 ; VI-NEXT:    ; implicit-def: $vgpr31
-; VI-NEXT:    ; implicit-def: $vgpr3
+; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    ; implicit-def: $vgpr11
 ; VI-NEXT:    ; implicit-def: $vgpr19
 ; VI-NEXT:    ; implicit-def: $vgpr27
@@ -31398,92 +31402,97 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) {
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 24, v7
 ; VI-NEXT:    v_lshrrev_b32_e32 v29, 8, v7
 ; VI-NEXT:    v_lshrrev_b32_e32 v25, 8, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v23, 24, v33
-; VI-NEXT:    v_lshrrev_b32_e32 v21, 8, v33
-; VI-NEXT:    v_lshrrev_b32_e32 v17, 8, v32
-; VI-NEXT:    v_lshrrev_b32_e32 v15, 24, v35
-; VI-NEXT:    v_lshrrev_b32_e32 v13, 8, v35
-; VI-NEXT:    v_lshrrev_b32_e32 v9, 8, v34
-; VI-NEXT:    v_lshrrev_b32_e32 v37, 24, v1
-; VI-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; VI-NEXT:    v_lshrrev_b64 v[27:28], 24, v[6:7]
-; VI-NEXT:    v_lshrrev_b64 v[19:20], 24, v[32:33]
-; VI-NEXT:    v_lshrrev_b64 v[11:12], 24, v[34:35]
-; VI-NEXT:    v_lshrrev_b64 v[3:4], 24, v[0:1]
+; VI-NEXT:    v_lshrrev_b32_e32 v23, 24, v5
+; VI-NEXT:    v_lshrrev_b32_e32 v21, 8, v5
+; VI-NEXT:    v_lshrrev_b32_e32 v17, 8, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; VI-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; VI-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; VI-NEXT:    v_lshrrev_b32_e32 v39, 24, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v37, 8, v1
 ; VI-NEXT:    v_lshrrev_b32_e32 v38, 8, v0
+; VI-NEXT:    v_lshrrev_b64 v[27:28], 24, v[6:7]
+; VI-NEXT:    v_lshrrev_b64 v[19:20], 24, v[4:5]
+; VI-NEXT:    v_lshrrev_b64 v[11:12], 24, v[2:3]
+; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v50, v0
+; VI-NEXT:    v_mov_b32_e32 v48, v1
+; VI-NEXT:    v_mov_b32_e32 v8, v2
+; VI-NEXT:    v_mov_b32_e32 v35, v3
+; VI-NEXT:    v_mov_b32_e32 v16, v4
+; VI-NEXT:    v_mov_b32_e32 v49, v5
+; VI-NEXT:    v_mov_b32_e32 v24, v6
+; VI-NEXT:    v_mov_b32_e32 v51, v7
+; VI-NEXT:    ; implicit-def: $vgpr1
+; VI-NEXT:    ; implicit-def: $vgpr3
+; VI-NEXT:    ; implicit-def: $vgpr5
+; VI-NEXT:    ; implicit-def: $vgpr7
 ; VI-NEXT:  .LBB96_2: ; %Flow
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB96_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
-; VI-NEXT:    v_mov_b32_e32 v13, 3
-; VI-NEXT:    v_add_u16_sdwa v2, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v15, 3, v0
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; VI-NEXT:    v_add_u16_sdwa v14, v35, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v3, v15, v0
-; VI-NEXT:    v_add_u16_e32 v16, 3, v35
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v14
-; VI-NEXT:    v_add_u16_sdwa v10, v34, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v12, v16, v0
-; VI-NEXT:    v_add_u16_e32 v17, 3, v34
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v10
-; VI-NEXT:    v_add_u16_sdwa v22, v33, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v11, v17, v0
-; VI-NEXT:    v_add_u16_e32 v19, 3, v33
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v22
-; VI-NEXT:    v_add_u16_sdwa v18, v32, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_sdwa v36, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v9, v19, v0
-; VI-NEXT:    v_add_u16_e32 v20, 3, v32
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v18
-; VI-NEXT:    v_add_u16_sdwa v30, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v5, 3, v1
+; VI-NEXT:    v_mov_b32_e32 v9, 3
+; VI-NEXT:    v_add_u16_sdwa v36, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v32, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v14, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v22, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v18, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v30, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v26, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_e32 v48, 3, v1
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v36
-; VI-NEXT:    v_or_b32_e32 v8, v20, v0
-; VI-NEXT:    v_add_u16_e32 v23, 3, v7
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v30
-; VI-NEXT:    v_add_u16_sdwa v26, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v4, v5, v1
-; VI-NEXT:    v_or_b32_e32 v1, v23, v0
-; VI-NEXT:    v_add_u16_e32 v7, 3, v6
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v26
-; VI-NEXT:    v_or_b32_e32 v0, v7, v0
-; VI-NEXT:    v_lshrrev_b64 v[27:28], 24, v[0:1]
-; VI-NEXT:    v_lshrrev_b32_e32 v29, 8, v1
-; VI-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
-; VI-NEXT:    v_mov_b32_e32 v1, v5
-; VI-NEXT:    v_mov_b32_e32 v32, v20
-; VI-NEXT:    v_mov_b32_e32 v33, v19
-; VI-NEXT:    v_lshrrev_b64 v[19:20], 24, v[8:9]
-; VI-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; VI-NEXT:    v_lshrrev_b32_e32 v9, 8, v11
-; VI-NEXT:    v_lshrrev_b64 v[11:12], 24, v[11:12]
-; VI-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; VI-NEXT:    v_lshrrev_b32_e32 v38, 8, v3
-; VI-NEXT:    v_lshrrev_b64 v[3:4], 24, v[3:4]
-; VI-NEXT:    v_lshrrev_b32_e32 v25, 8, v0
-; VI-NEXT:    v_mov_b32_e32 v0, v15
-; VI-NEXT:    v_mov_b32_e32 v34, v17
-; VI-NEXT:    v_mov_b32_e32 v35, v16
-; VI-NEXT:    v_mov_b32_e32 v6, v7
-; VI-NEXT:    v_mov_b32_e32 v7, v23
-; VI-NEXT:    v_lshrrev_b32_e32 v17, 8, v8
+; VI-NEXT:    v_add_u16_e32 v50, 3, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v32
+; VI-NEXT:    v_add_u16_e32 v35, 3, v3
+; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v14
+; VI-NEXT:    v_add_u16_e32 v8, 3, v2
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v10
+; VI-NEXT:    v_add_u16_e32 v49, 3, v5
+; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v22
+; VI-NEXT:    v_add_u16_e32 v16, 3, v4
+; VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v18
+; VI-NEXT:    v_add_u16_e32 v51, 3, v7
+; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v30
+; VI-NEXT:    v_add_u16_e32 v24, 3, v6
+; VI-NEXT:    v_lshlrev_b32_e32 v6, 16, v26
+; VI-NEXT:    v_or_b32_e32 v1, v48, v1
+; VI-NEXT:    v_or_b32_e32 v0, v50, v0
+; VI-NEXT:    v_or_b32_e32 v3, v35, v3
+; VI-NEXT:    v_or_b32_e32 v2, v8, v2
+; VI-NEXT:    v_or_b32_e32 v5, v49, v5
+; VI-NEXT:    v_or_b32_e32 v4, v16, v4
+; VI-NEXT:    v_or_b32_e32 v7, v51, v7
+; VI-NEXT:    v_or_b32_e32 v6, v24, v6
+; VI-NEXT:    v_lshrrev_b64 v[27:28], 24, v[6:7]
+; VI-NEXT:    v_lshrrev_b64 v[19:20], 24, v[4:5]
+; VI-NEXT:    v_lshrrev_b64 v[11:12], 24, v[2:3]
+; VI-NEXT:    v_lshrrev_b64 v[33:34], 24, v[0:1]
+; VI-NEXT:    v_lshrrev_b32_e32 v29, 8, v7
+; VI-NEXT:    v_lshrrev_b32_e32 v25, 8, v6
+; VI-NEXT:    v_lshrrev_b32_e32 v21, 8, v5
+; VI-NEXT:    v_lshrrev_b32_e32 v17, 8, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; VI-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; VI-NEXT:    v_lshrrev_b32_e32 v37, 8, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v38, 8, v0
 ; VI-NEXT:    v_bfe_u32 v31, v30, 8, 8
 ; VI-NEXT:    v_bfe_u32 v23, v22, 8, 8
 ; VI-NEXT:    v_bfe_u32 v15, v14, 8, 8
-; VI-NEXT:    v_bfe_u32 v37, v36, 8, 8
+; VI-NEXT:    v_bfe_u32 v39, v36, 8, 8
 ; VI-NEXT:  .LBB96_4: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    v_mov_b32_e32 v4, v1
-; VI-NEXT:    v_mov_b32_e32 v8, v34
-; VI-NEXT:    v_mov_b32_e32 v12, v35
-; VI-NEXT:    v_mov_b32_e32 v16, v32
-; VI-NEXT:    v_mov_b32_e32 v20, v33
-; VI-NEXT:    v_mov_b32_e32 v24, v6
-; VI-NEXT:    v_mov_b32_e32 v28, v7
+; VI-NEXT:    v_mov_b32_e32 v0, v50
 ; VI-NEXT:    v_mov_b32_e32 v1, v38
+; VI-NEXT:    v_mov_b32_e32 v2, v32
+; VI-NEXT:    v_mov_b32_e32 v3, v33
+; VI-NEXT:    v_mov_b32_e32 v4, v48
+; VI-NEXT:    v_mov_b32_e32 v5, v37
 ; VI-NEXT:    v_mov_b32_e32 v6, v36
-; VI-NEXT:    v_mov_b32_e32 v7, v37
+; VI-NEXT:    v_mov_b32_e32 v7, v39
+; VI-NEXT:    v_mov_b32_e32 v12, v35
+; VI-NEXT:    v_mov_b32_e32 v20, v49
+; VI-NEXT:    v_mov_b32_e32 v28, v51
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: bitcast_v16i16_to_v32i8:
@@ -40353,19 +40362,19 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v27.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.h, v19.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.h, v11.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v3.h
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[2:3]
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v27.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.h, v26.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.h, v19.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v18.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v11.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v10.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v3.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v3.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.h, v11.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.h, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.h, v19.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.h, v26.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v27.h
 ; GFX11-TRUE16-NEXT:  .LBB108_2: ; %Flow
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB108_4
@@ -40527,7 +40536,6 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
 ; GFX11-TRUE16-NEXT:  .LBB108_4: ; %end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v35.l
@@ -43066,10 +43074,10 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v18
 ; SI-NEXT:    v_readfirstlane_b32 s42, v15
 ; SI-NEXT:    v_readfirstlane_b32 s43, v14
-; SI-NEXT:    v_readfirstlane_b32 s13, v7
-; SI-NEXT:    v_readfirstlane_b32 s15, v6
-; SI-NEXT:    v_readfirstlane_b32 s7, v1
-; SI-NEXT:    v_readfirstlane_b32 s6, v0
+; SI-NEXT:    v_readfirstlane_b32 s40, v7
+; SI-NEXT:    v_readfirstlane_b32 s41, v6
+; SI-NEXT:    v_readfirstlane_b32 s10, v1
+; SI-NEXT:    v_readfirstlane_b32 s9, v0
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v9
@@ -43080,15 +43088,15 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
 ; SI-NEXT:    s_and_b32 s4, s16, 0xff
 ; SI-NEXT:    s_lshl_b32 s4, s4, 16
 ; SI-NEXT:    s_lshl_b32 s5, s17, 24
-; SI-NEXT:    s_or_b32 s8, s5, s4
+; SI-NEXT:    s_or_b32 s6, s5, s4
 ; SI-NEXT:    s_and_b32 s4, s18, 0xff
 ; SI-NEXT:    s_lshl_b32 s4, s4, 16
 ; SI-NEXT:    s_lshl_b32 s5, s19, 24
-; SI-NEXT:    s_or_b32 s9, s5, s4
+; SI-NEXT:    s_or_b32 s7, s5, s4
 ; SI-NEXT:    s_and_b32 s4, s20, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s21, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_lshl_b32 s10, s4, 16
+; SI-NEXT:    s_lshl_b32 s8, s4, 16
 ; SI-NEXT:    s_and_b32 s4, s22, 0xff
 ; SI-NEXT:    s_lshl_b32 s4, s4, 16
 ; SI-NEXT:    s_lshl_b32 s5, s23, 24
@@ -43106,24 +43114,24 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
 ; SI-NEXT:    s_lshl_b32 s4, s4, 16
 ; SI-NEXT:    s_lshl_b32 s5, s27, 24
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    s_or_b32 s14, s5, s4
+; SI-NEXT:    s_or_b32 s13, s5, s4
 ; SI-NEXT:    s_and_b32 s4, s28, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s29, 8
 ; SI-NEXT:    v_or_b32_e32 v9, v0, v7
 ; SI-NEXT:    v_and_b32_e32 v7, 0xff, v8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    s_lshl_b32 s40, s4, 16
-; SI-NEXT:    s_and_b32 s4, s6, 0xff
+; SI-NEXT:    s_lshl_b32 s14, s4, 16
+; SI-NEXT:    s_and_b32 s4, s9, 0xff
 ; SI-NEXT:    v_or_b32_e32 v19, v1, v7
 ; SI-NEXT:    v_and_b32_e32 v7, 0xff, v10
 ; SI-NEXT:    s_lshl_b32 s4, s4, 16
-; SI-NEXT:    s_lshl_b32 s5, s7, 24
+; SI-NEXT:    s_lshl_b32 s5, s10, 24
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 24, v11
-; SI-NEXT:    s_or_b32 s41, s5, s4
-; SI-NEXT:    s_and_b32 s4, s15, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s13, 8
+; SI-NEXT:    s_or_b32 s15, s5, s4
+; SI-NEXT:    s_and_b32 s4, s41, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s40, 8
 ; SI-NEXT:    v_or_b32_e32 v18, v13, v7
 ; SI-NEXT:    v_and_b32_e32 v7, 0xff, v12
 ; SI-NEXT:    s_or_b32 s4, s4, s5
@@ -43159,11 +43167,11 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v9
-; SI-NEXT:    s_add_i32 s15, s15, 3
+; SI-NEXT:    s_add_i32 s41, s41, 3
 ; SI-NEXT:    v_or_b32_e32 v6, s4, v6
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v7
-; SI-NEXT:    s_and_b32 s4, s15, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s13, 8
+; SI-NEXT:    s_and_b32 s4, s41, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s40, 8
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 3, v8
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
@@ -43175,11 +43183,11 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v1, s4, v1
 ; SI-NEXT:    s_and_b32 s4, s28, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s29, 8
-; SI-NEXT:    s_add_i32 s6, s6, 3
+; SI-NEXT:    s_add_i32 s9, s9, 3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s6, s6, 0xff
+; SI-NEXT:    s_and_b32 s6, s9, 0xff
 ; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s7, 24
+; SI-NEXT:    s_lshl_b32 s5, s10, 24
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s5, s6
@@ -43202,35 +43210,36 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
 ; SI-NEXT:    s_add_i32 s22, s22, 3
 ; SI-NEXT:    s_or_b32 s6, s7, s6
 ; SI-NEXT:    s_and_b32 s8, s22, 0xff
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
 ; SI-NEXT:    s_addk_i32 s6, 0x300
 ; SI-NEXT:    s_lshl_b32 s7, s23, 24
 ; SI-NEXT:    s_lshl_b32 s8, s8, 16
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
 ; SI-NEXT:    s_and_b32 s6, s6, 0xffff
 ; SI-NEXT:    s_or_b32 s7, s7, s8
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; SI-NEXT:    s_or_b32 s6, s7, s6
 ; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 3, v4
-; SI-NEXT:    s_or_b32 s6, s7, s6
-; SI-NEXT:    s_and_b32 s7, s16, 0xff
-; SI-NEXT:    s_lshl_b32 s8, s17, 8
+; SI-NEXT:    s_add_i32 s8, s6, 0x3000000
+; SI-NEXT:    s_and_b32 s6, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s17, 8
 ; SI-NEXT:    s_add_i32 s18, s18, 3
 ; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    s_or_b32 s6, s7, s6
 ; SI-NEXT:    s_and_b32 s9, s18, 0xff
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x300, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    s_addk_i32 s7, 0x300
-; SI-NEXT:    s_lshl_b32 s8, s19, 24
+; SI-NEXT:    s_addk_i32 s6, 0x300
+; SI-NEXT:    s_lshl_b32 s7, s19, 24
 ; SI-NEXT:    s_lshl_b32 s9, s9, 16
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v3
-; SI-NEXT:    s_and_b32 s7, s7, 0xffff
-; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_or_b32 s7, s7, s9
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v2
-; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    s_or_b32 s6, s7, s6
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x3000000, v6
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x3000000, v5
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x3000000, v1
@@ -43238,15 +43247,14 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
 ; SI-NEXT:    s_add_i32 s4, s4, 0x3000000
 ; SI-NEXT:    s_add_i32 s5, s5, 0x3000000
 ; SI-NEXT:    s_add_i32 s6, s6, 0x3000000
-; SI-NEXT:    s_add_i32 s7, s7, 0x3000000
-; SI-NEXT:    s_and_b32 s9, s7, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s8, s7, 16
-; SI-NEXT:    s_and_b32 s11, s6, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s10, s6, 16
-; SI-NEXT:    s_and_b32 s14, s5, 0xffff0000
+; SI-NEXT:    s_and_b32 s7, s6, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s6, s6, 16
+; SI-NEXT:    s_and_b32 s11, s8, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s8, s8, 16
+; SI-NEXT:    s_and_b32 s13, s5, 0xffff0000
 ; SI-NEXT:    s_lshl_b32 s12, s5, 16
-; SI-NEXT:    s_and_b32 s41, s4, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s40, s4, 16
+; SI-NEXT:    s_and_b32 s15, s4, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s14, s4, 16
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v1
@@ -43257,14 +43265,14 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v6
 ; SI-NEXT:    s_branch .LBB111_5
 ; SI-NEXT:  .LBB111_3:
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr7
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $sgpr10
 ; SI-NEXT:    ; implicit-def: $sgpr11
 ; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr13
 ; SI-NEXT:    ; implicit-def: $sgpr14
-; SI-NEXT:    ; implicit-def: $sgpr40
-; SI-NEXT:    ; implicit-def: $sgpr41
+; SI-NEXT:    ; implicit-def: $sgpr15
 ; SI-NEXT:    ; implicit-def: $vgpr17
 ; SI-NEXT:    ; implicit-def: $vgpr9
 ; SI-NEXT:    ; implicit-def: $sgpr44
@@ -43278,14 +43286,14 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v10, s44
 ; SI-NEXT:    v_mov_b32_e32 v14, s45
 ; SI-NEXT:  .LBB111_5: ; %end
-; SI-NEXT:    v_mov_b32_e32 v0, s8
-; SI-NEXT:    v_mov_b32_e32 v1, s9
-; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    v_mov_b32_e32 v2, s8
 ; SI-NEXT:    v_mov_b32_e32 v3, s11
 ; SI-NEXT:    v_mov_b32_e32 v4, s12
-; SI-NEXT:    v_mov_b32_e32 v5, s14
-; SI-NEXT:    v_mov_b32_e32 v6, s40
-; SI-NEXT:    v_mov_b32_e32 v7, s41
+; SI-NEXT:    v_mov_b32_e32 v5, s13
+; SI-NEXT:    v_mov_b32_e32 v6, s14
+; SI-NEXT:    v_mov_b32_e32 v7, s15
 ; SI-NEXT:    v_mov_b32_e32 v8, v17
 ; SI-NEXT:    v_mov_b32_e32 v11, v19
 ; SI-NEXT:    v_mov_b32_e32 v12, v18

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
index 5b82535bf956a..e5a1260fa4538 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
@@ -4076,7 +4076,7 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s22, 0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s61, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s63, 0
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB13_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s14, s21, 24
@@ -4102,14 +4102,14 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s58, s1, 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s1, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s1, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s0, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s0, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s0, 8
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[4:5], s[20:21], 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[6:7], s[18:19], 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[8:9], s[16:17], 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[10:11], s[2:3], 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[12:13], s[0:1], 24
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s61
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s63
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB13_3
 ; GFX11-TRUE16-NEXT:  .LBB13_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_i32 s1, s1, 3
@@ -4150,12 +4150,12 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s58, s1, 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s1, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s1, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s0, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s0, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s0, 8
 ; GFX11-TRUE16-NEXT:  .LBB13_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s63
-; GFX11-TRUE16-NEXT:    s_mov_b32 s7, s62
+; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s62
+; GFX11-TRUE16-NEXT:    s_mov_b32 s7, s61
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s12
 ; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s5, 8
@@ -4291,8 +4291,8 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    scratch_store_b64 v0, v[9:10], off offset:32
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB13_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr63_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr62_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr61_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr12_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr60_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr59_lo16
@@ -4327,7 +4327,7 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s22, 0
-; GFX11-FAKE16-NEXT:    s_mov_b32 s61, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s63, 0
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB13_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s14, s21, 24
@@ -4353,14 +4353,14 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s58, s1, 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s59, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s60, s1, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s62, s0, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s63, s0, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s61, s0, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s62, s0, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[4:5], s[20:21], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[6:7], s[18:19], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[8:9], s[16:17], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[10:11], s[2:3], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[12:13], s[0:1], 24
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s61
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s63
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB13_3
 ; GFX11-FAKE16-NEXT:  .LBB13_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    s_add_i32 s1, s1, 3
@@ -4401,12 +4401,12 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s58, s1, 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s59, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s60, s1, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s62, s0, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s63, s0, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s61, s0, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s62, s0, 8
 ; GFX11-FAKE16-NEXT:  .LBB13_3: ; %end
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s63, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s62, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s62, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s7, s61, 0xff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s12, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s5
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, s7, s9
@@ -4506,8 +4506,8 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32
 ; GFX11-FAKE16-NEXT:    scratch_store_b64 v0, v[9:10], off offset:32
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-FAKE16-NEXT:  .LBB13_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr63
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr62
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr61
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr12
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr60
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr59
@@ -16547,9 +16547,10 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v21
 ; SI-NEXT:    s_waitcnt expcnt(6)
 ; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v2
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v21
 ; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v4
 ; SI-NEXT:    s_waitcnt expcnt(4)
 ; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v6
@@ -16562,48 +16563,47 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v16
 ; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v18
 ; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v20
+; SI-NEXT:    ; kill: killed $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr51
 ; SI-NEXT:    ; implicit-def: $vgpr30
 ; SI-NEXT:    ; implicit-def: $vgpr52
 ; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr46
 ; SI-NEXT:    ; implicit-def: $vgpr47
+; SI-NEXT:    ; kill: killed $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr23
 ; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; implicit-def: $vgpr37
 ; SI-NEXT:    ; implicit-def: $vgpr36
 ; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr42
-; SI-NEXT:    ; implicit-def: $vgpr43
 ; SI-NEXT:    ; implicit-def: $vgpr44
+; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; implicit-def: $vgpr46
 ; SI-NEXT:    ; implicit-def: $vgpr21
 ; SI-NEXT:    ; implicit-def: $vgpr38
 ; SI-NEXT:    ; implicit-def: $vgpr34
 ; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    ; implicit-def: $vgpr40
 ; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr42
+; SI-NEXT:    ; implicit-def: $vgpr43
 ; SI-NEXT:    ; implicit-def: $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr29
 ; SI-NEXT:    ; implicit-def: $vgpr28
 ; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr53
 ; SI-NEXT:    ; implicit-def: $vgpr54
+; SI-NEXT:    ; implicit-def: $vgpr55
+; SI-NEXT:    ; implicit-def: $vgpr40
 ; SI-NEXT:    ; implicit-def: $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $vgpr27
 ; SI-NEXT:    ; implicit-def: $vgpr26
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr50
-; SI-NEXT:    ; kill: killed $vgpr51
 ; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    ; implicit-def: $vgpr53
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB48_2
@@ -16628,10 +16628,10 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v6, v1, v25
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v19
 ; SI-NEXT:    v_or_b32_e32 v2, v1, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v20
+; SI-NEXT:    v_bfe_u32 v1, v4, 8, 8
 ; SI-NEXT:    v_alignbit_b32 v39, v31, v30, 24
 ; SI-NEXT:    v_alignbit_b32 v48, v31, v30, 16
 ; SI-NEXT:    v_alignbit_b32 v52, v31, v30, 8
@@ -16647,20 +16647,20 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_alignbit_b32 v26, v2, v6, 24
 ; SI-NEXT:    v_alignbit_b32 v27, v2, v6, 16
 ; SI-NEXT:    v_alignbit_b32 v32, v2, v6, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 8, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 8, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 8, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 8, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 8, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 8, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 8, v10
 ; SI-NEXT:    v_lshrrev_b32_e32 v50, 8, v2
-; SI-NEXT:    v_and_b32_e32 v46, 0xffff, v4
-; SI-NEXT:    v_and_b32_e32 v43, 0xffff, v8
-; SI-NEXT:    v_and_b32_e32 v40, 0xffff, v12
-; SI-NEXT:    v_and_b32_e32 v53, 0xffff, v16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    v_bfe_u32 v47, v4, 8, 8
-; SI-NEXT:    v_bfe_u32 v44, v8, 8, 8
-; SI-NEXT:    v_bfe_u32 v41, v12, 8, 8
-; SI-NEXT:    v_bfe_u32 v54, v16, 8, 8
-; SI-NEXT:    v_bfe_u32 v51, v20, 8, 8
+; SI-NEXT:    v_and_b32_e32 v45, 0xffff, v8
+; SI-NEXT:    v_and_b32_e32 v42, 0xffff, v12
+; SI-NEXT:    v_and_b32_e32 v55, 0xffff, v16
+; SI-NEXT:    v_and_b32_e32 v51, 0xffff, v20
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_u32 v46, v8, 8, 8
+; SI-NEXT:    v_bfe_u32 v43, v12, 8, 8
+; SI-NEXT:    v_bfe_u32 v40, v16, 8, 8
+; SI-NEXT:    v_bfe_u32 v53, v20, 8, 8
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr5
@@ -16682,7 +16682,10 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr25
 ; SI-NEXT:    ; implicit-def: $vgpr24
 ; SI-NEXT:  .LBB48_2: ; %Flow
-; SI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; SI-NEXT:    s_or_saveexec_b64 s[4:5], s[4:5]
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB48_4
 ; SI-NEXT:  ; %bb.3: ; %cmp.true
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v13
@@ -16727,7 +16730,6 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, s6, v2
 ; SI-NEXT:    v_add_i32_e32 v22, vcc, s6, v4
 ; SI-NEXT:    v_add_i32_e32 v31, vcc, s6, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
 ; SI-NEXT:    v_alignbit_b32 v39, v31, v30, 24
 ; SI-NEXT:    v_alignbit_b32 v48, v31, v30, 16
 ; SI-NEXT:    v_alignbit_b32 v52, v31, v30, 8
@@ -16743,23 +16745,22 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_alignbit_b32 v26, v2, v6, 24
 ; SI-NEXT:    v_alignbit_b32 v27, v2, v6, 16
 ; SI-NEXT:    v_alignbit_b32 v32, v2, v6, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 24, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 8, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v44, 24, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 8, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 24, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 8, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 24, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v10
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 24, v2
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 24, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 8, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 24, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 8, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 24, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 8, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 24, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 8, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v53, 24, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v50, 8, v2
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; SI-NEXT:  .LBB48_4: ; %end
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -16775,11 +16776,13 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v31
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v45
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v47
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v46
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v8
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v47
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v12
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v3, v4, v3
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
@@ -16799,11 +16802,11 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v42
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v44
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v43
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v45
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v44
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v46
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v3, v4, v3
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
@@ -16823,11 +16826,11 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v55
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v41
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v40
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v42
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v41
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v43
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v3, v4, v3
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
@@ -16845,15 +16848,13 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 24, v0
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v54
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v54
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v53
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v55
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v40
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v3, v4, v3
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
@@ -16875,15 +16876,13 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v50
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v51
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v51
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v53
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -16918,27 +16917,40 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
 ; VI-NEXT:    v_lshrrev_b32_e32 v25, 16, v3
 ; VI-NEXT:    v_lshrrev_b32_e32 v23, 16, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v26, 16, v1
-; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr16
+; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    ; implicit-def: $vgpr40
 ; VI-NEXT:    ; implicit-def: $vgpr15
+; VI-NEXT:    ; implicit-def: $vgpr32
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr41
 ; VI-NEXT:    ; implicit-def: $vgpr38
-; VI-NEXT:    ; implicit-def: $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr37
+; VI-NEXT:    ; implicit-def: $vgpr51
 ; VI-NEXT:    ; implicit-def: $vgpr14
+; VI-NEXT:    ; implicit-def: $vgpr37
+; VI-NEXT:    ; implicit-def: $vgpr48
+; VI-NEXT:    ; implicit-def: $vgpr52
+; VI-NEXT:    ; implicit-def: $vgpr50
 ; VI-NEXT:    ; implicit-def: $vgpr36
-; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr34
 ; VI-NEXT:    ; implicit-def: $vgpr13
-; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr49
 ; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr39
+; VI-NEXT:    ; implicit-def: $vgpr55
 ; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    ; implicit-def: $vgpr12
+; VI-NEXT:    ; implicit-def: $vgpr53
 ; VI-NEXT:    ; implicit-def: $vgpr30
-; VI-NEXT:    ; implicit-def: $vgpr32
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr43
 ; VI-NEXT:    ; implicit-def: $vgpr28
+; VI-NEXT:    ; implicit-def: $vgpr11
+; VI-NEXT:    ; implicit-def: $vgpr42
 ; VI-NEXT:    ; implicit-def: $vgpr27
 ; VI-NEXT:    ; implicit-def: $vgpr29
-; VI-NEXT:    ; implicit-def: $vgpr11
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB48_2
@@ -16947,22 +16959,37 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
 ; VI-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
 ; VI-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
 ; VI-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
-; VI-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
 ; VI-NEXT:    v_lshrrev_b32_e32 v29, 24, v10
 ; VI-NEXT:    v_lshrrev_b32_e32 v27, 8, v10
 ; VI-NEXT:    v_lshrrev_b32_e32 v28, 8, v9
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 24, v8
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v8
 ; VI-NEXT:    v_lshrrev_b32_e32 v30, 8, v8
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v7
-; VI-NEXT:    v_lshrrev_b32_e32 v35, 24, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v5
-; VI-NEXT:    v_lshrrev_b32_e32 v39, 24, v4
-; VI-NEXT:    v_lshrrev_b32_e32 v36, 8, v4
-; VI-NEXT:    v_lshrrev_b32_e32 v37, 8, v3
-; VI-NEXT:    v_lshrrev_b32_e32 v48, 24, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v38, 8, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v39, 24, v6
+; VI-NEXT:    v_lshrrev_b32_e32 v35, 8, v6
+; VI-NEXT:    v_lshrrev_b32_e32 v36, 8, v5
+; VI-NEXT:    v_lshrrev_b32_e32 v52, 24, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v48, 8, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 8, v3
+; VI-NEXT:    v_lshrrev_b32_e32 v41, 24, v2
+; VI-NEXT:    v_lshrrev_b32_e32 v54, 8, v2
+; VI-NEXT:    v_lshrrev_b32_e32 v40, 8, v1
+; VI-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
+; VI-NEXT:    v_mov_b32_e32 v34, v1
+; VI-NEXT:    v_mov_b32_e32 v32, v2
+; VI-NEXT:    v_mov_b32_e32 v38, v3
+; VI-NEXT:    v_mov_b32_e32 v37, v4
+; VI-NEXT:    v_mov_b32_e32 v50, v5
+; VI-NEXT:    v_mov_b32_e32 v49, v6
+; VI-NEXT:    v_mov_b32_e32 v55, v7
+; VI-NEXT:    v_mov_b32_e32 v53, v8
+; VI-NEXT:    v_mov_b32_e32 v43, v9
+; VI-NEXT:    v_mov_b32_e32 v42, v10
+; VI-NEXT:    ; implicit-def: $vgpr1
+; VI-NEXT:    ; implicit-def: $vgpr3
+; VI-NEXT:    ; implicit-def: $vgpr5
+; VI-NEXT:    ; implicit-def: $vgpr7
+; VI-NEXT:    ; implicit-def: $vgpr9
 ; VI-NEXT:  .LBB48_2: ; %Flow
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB48_4
@@ -16972,144 +16999,137 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
 ; VI-NEXT:    v_add_u16_sdwa v20, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_u16_sdwa v18, v8, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_u16_sdwa v22, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v55, 3, v10
+; VI-NEXT:    v_add_u16_e32 v42, 3, v10
 ; VI-NEXT:    v_lshlrev_b32_e32 v10, 16, v17
-; VI-NEXT:    v_add_u16_e32 v40, 3, v9
+; VI-NEXT:    v_add_u16_e32 v43, 3, v9
 ; VI-NEXT:    v_lshlrev_b32_e32 v9, 16, v20
 ; VI-NEXT:    v_add_u16_sdwa v19, v6, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_u16_sdwa v24, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_u16_e32 v53, 3, v8
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v18
-; VI-NEXT:    v_add_u16_e32 v54, 3, v7
+; VI-NEXT:    v_add_u16_e32 v55, 3, v7
 ; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v22
-; VI-NEXT:    v_or_b32_e32 v10, v55, v10
-; VI-NEXT:    v_or_b32_e32 v9, v40, v9
+; VI-NEXT:    v_or_b32_e32 v10, v42, v10
+; VI-NEXT:    v_or_b32_e32 v9, v43, v9
 ; VI-NEXT:    v_add_u16_sdwa v23, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_u16_sdwa v26, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_u16_sdwa v21, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_u16_sdwa v25, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v51, 3, v6
+; VI-NEXT:    v_add_u16_e32 v49, 3, v6
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, 16, v19
-; VI-NEXT:    v_add_u16_e32 v52, 3, v5
+; VI-NEXT:    v_add_u16_e32 v50, 3, v5
 ; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v24
 ; VI-NEXT:    v_or_b32_e32 v8, v53, v8
-; VI-NEXT:    v_or_b32_e32 v7, v54, v7
+; VI-NEXT:    v_or_b32_e32 v7, v55, v7
 ; VI-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
-; VI-NEXT:    v_add_u16_e32 v49, 3, v4
+; VI-NEXT:    v_add_u16_e32 v37, 3, v4
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v21
-; VI-NEXT:    v_add_u16_e32 v50, 3, v3
+; VI-NEXT:    v_add_u16_e32 v38, 3, v3
 ; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v25
-; VI-NEXT:    v_or_b32_e32 v6, v51, v6
-; VI-NEXT:    v_or_b32_e32 v5, v52, v5
+; VI-NEXT:    v_or_b32_e32 v6, v49, v6
+; VI-NEXT:    v_or_b32_e32 v5, v50, v5
 ; VI-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
-; VI-NEXT:    v_add_u16_e32 v39, 3, v2
+; VI-NEXT:    v_add_u16_e32 v32, 3, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v23
-; VI-NEXT:    v_add_u16_e32 v48, 3, v1
+; VI-NEXT:    v_add_u16_e32 v34, 3, v1
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v26
-; VI-NEXT:    v_or_b32_e32 v4, v49, v4
-; VI-NEXT:    v_or_b32_e32 v3, v50, v3
+; VI-NEXT:    v_or_b32_e32 v4, v37, v4
+; VI-NEXT:    v_or_b32_e32 v3, v38, v3
 ; VI-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
-; VI-NEXT:    v_or_b32_e32 v2, v39, v2
-; VI-NEXT:    v_or_b32_e32 v1, v48, v1
+; VI-NEXT:    v_or_b32_e32 v2, v32, v2
+; VI-NEXT:    v_or_b32_e32 v1, v34, v1
 ; VI-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
 ; VI-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
 ; VI-NEXT:    v_lshrrev_b32_e32 v27, 8, v10
 ; VI-NEXT:    v_lshrrev_b32_e32 v28, 8, v9
 ; VI-NEXT:    v_lshrrev_b32_e32 v30, 8, v8
 ; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v7
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 8, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v5
-; VI-NEXT:    v_lshrrev_b32_e32 v36, 8, v4
-; VI-NEXT:    v_lshrrev_b32_e32 v37, 8, v3
-; VI-NEXT:    v_lshrrev_b32_e32 v38, 8, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v35, 8, v6
+; VI-NEXT:    v_lshrrev_b32_e32 v36, 8, v5
+; VI-NEXT:    v_lshrrev_b32_e32 v48, 8, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 8, v3
+; VI-NEXT:    v_lshrrev_b32_e32 v54, 8, v2
+; VI-NEXT:    v_lshrrev_b32_e32 v40, 8, v1
 ; VI-NEXT:    v_bfe_u32 v29, v17, 8, 8
-; VI-NEXT:    v_bfe_u32 v32, v18, 8, 8
-; VI-NEXT:    v_bfe_u32 v35, v19, 8, 8
-; VI-NEXT:    v_mov_b32_e32 v1, v48
-; VI-NEXT:    v_mov_b32_e32 v2, v39
-; VI-NEXT:    v_mov_b32_e32 v3, v50
-; VI-NEXT:    v_mov_b32_e32 v4, v49
-; VI-NEXT:    v_mov_b32_e32 v5, v52
-; VI-NEXT:    v_mov_b32_e32 v6, v51
-; VI-NEXT:    v_mov_b32_e32 v7, v54
-; VI-NEXT:    v_mov_b32_e32 v8, v53
-; VI-NEXT:    v_mov_b32_e32 v9, v40
-; VI-NEXT:    v_mov_b32_e32 v10, v55
-; VI-NEXT:    v_bfe_u32 v39, v21, 8, 8
-; VI-NEXT:    v_bfe_u32 v48, v23, 8, 8
+; VI-NEXT:    v_bfe_u32 v33, v18, 8, 8
+; VI-NEXT:    v_bfe_u32 v39, v19, 8, 8
+; VI-NEXT:    v_bfe_u32 v52, v21, 8, 8
+; VI-NEXT:    v_bfe_u32 v41, v23, 8, 8
 ; VI-NEXT:  .LBB48_4: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    v_lshlrev_b16_e32 v15, 8, v15
-; VI-NEXT:    v_lshlrev_b16_e32 v16, 8, v16
-; VI-NEXT:    v_or_b32_sdwa v15, v26, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v40
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v15
+; VI-NEXT:    v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v38
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v48
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v54
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v41
+; VI-NEXT:    v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v37
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v51
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v14
-; VI-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v36
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v39
-; VI-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v48
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v52
+; VI-NEXT:    v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v2, v21, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 12, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v34
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v36
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v13
-; VI-NEXT:    v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v2, v24, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 16, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v33
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v35
-; VI-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v35
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v39
+; VI-NEXT:    v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v2, v19, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v31
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v12
-; VI-NEXT:    v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v2, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 24, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v30
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v32
-; VI-NEXT:    v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v33
+; VI-NEXT:    v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 28, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v28
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v11
-; VI-NEXT:    v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v27
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v29
-; VI-NEXT:    v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 36, v0
 ; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -20333,8 +20353,8 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v26
 ; SI-NEXT:    v_readfirstlane_b32 s14, v19
-; SI-NEXT:    v_readfirstlane_b32 s15, v18
-; SI-NEXT:    v_readfirstlane_b32 s11, v11
+; SI-NEXT:    v_readfirstlane_b32 s40, v18
+; SI-NEXT:    v_readfirstlane_b32 s12, v11
 ; SI-NEXT:    v_readfirstlane_b32 s13, v10
 ; SI-NEXT:    v_readfirstlane_b32 s8, v3
 ; SI-NEXT:    v_readfirstlane_b32 s9, v2
@@ -20360,22 +20380,22 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    s_lshl_b32 s10, s23, 24
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s10, s5
-; SI-NEXT:    s_or_b32 s12, s4, s5
+; SI-NEXT:    s_or_b32 s11, s4, s5
 ; SI-NEXT:    s_and_b32 s4, s18, 0xff
 ; SI-NEXT:    s_lshl_b32 s4, s4, 16
 ; SI-NEXT:    s_lshl_b32 s10, s19, 24
 ; SI-NEXT:    s_or_b32 s4, s10, s4
 ; SI-NEXT:    s_and_b32 s10, s28, 0xff
-; SI-NEXT:    s_lshl_b32 s40, s29, 8
-; SI-NEXT:    s_or_b32 s10, s10, s40
-; SI-NEXT:    s_and_b32 s40, s6, 0xff
-; SI-NEXT:    s_lshl_b32 s40, s40, 16
+; SI-NEXT:    s_lshl_b32 s15, s29, 8
+; SI-NEXT:    s_or_b32 s10, s10, s15
+; SI-NEXT:    s_and_b32 s15, s6, 0xff
+; SI-NEXT:    s_lshl_b32 s15, s15, 16
 ; SI-NEXT:    s_lshl_b32 s41, s7, 24
-; SI-NEXT:    s_or_b32 s43, s41, s40
-; SI-NEXT:    s_and_b32 s40, s26, 0xff
-; SI-NEXT:    s_lshl_b32 s40, s40, 16
+; SI-NEXT:    s_or_b32 s43, s41, s15
+; SI-NEXT:    s_and_b32 s15, s26, 0xff
+; SI-NEXT:    s_lshl_b32 s15, s15, 16
 ; SI-NEXT:    s_lshl_b32 s41, s27, 24
-; SI-NEXT:    s_or_b32 s40, s41, s40
+; SI-NEXT:    s_or_b32 s15, s41, s15
 ; SI-NEXT:    s_and_b32 s41, s16, 0xff
 ; SI-NEXT:    s_lshl_b32 s42, s17, 8
 ; SI-NEXT:    s_or_b32 s41, s41, s42
@@ -20392,12 +20412,12 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
 ; SI-NEXT:    v_or_b32_e32 v11, v0, v10
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v5, s40
+; SI-NEXT:    v_mov_b32_e32 v5, s15
 ; SI-NEXT:    v_or_b32_e32 v10, v9, v11
 ; SI-NEXT:    v_and_b32_e32 v9, 0xff, v4
 ; SI-NEXT:    v_and_b32_e32 v13, 0xff, v14
 ; SI-NEXT:    v_and_b32_e32 v17, 0xff, v16
-; SI-NEXT:    s_or_b32 s40, s4, s40
+; SI-NEXT:    s_or_b32 s15, s4, s15
 ; SI-NEXT:    s_and_b32 s4, s9, 0xff
 ; SI-NEXT:    s_lshl_b32 s42, s8, 8
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
@@ -20414,7 +20434,7 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    v_and_b32_e32 v18, 0xff, v24
 ; SI-NEXT:    v_or_b32_e32 v23, s4, v15
 ; SI-NEXT:    s_and_b32 s4, s13, 0xff
-; SI-NEXT:    s_lshl_b32 s42, s11, 8
+; SI-NEXT:    s_lshl_b32 s42, s12, 8
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; SI-NEXT:    v_or_b32_e32 v17, v17, v30
 ; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
@@ -20426,7 +20446,7 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    v_or_b32_e32 v18, v17, v32
 ; SI-NEXT:    v_and_b32_e32 v17, 0xff, v20
 ; SI-NEXT:    v_or_b32_e32 v26, s4, v21
-; SI-NEXT:    s_and_b32 s4, s15, 0xff
+; SI-NEXT:    s_and_b32 s4, s40, 0xff
 ; SI-NEXT:    s_lshl_b32 s42, s14, 8
 ; SI-NEXT:    s_and_b32 s10, s10, 0xffff
 ; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
@@ -20434,7 +20454,7 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    s_or_b32 s10, s10, s43
 ; SI-NEXT:    v_or_b32_e32 v33, v31, v17
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_alignbit_b32 v1, s12, v1, 16
+; SI-NEXT:    v_alignbit_b32 v1, s11, v1, 16
 ; SI-NEXT:    v_alignbit_b32 v5, s10, v5, 16
 ; SI-NEXT:    v_alignbit_b32 v9, v10, v15, 16
 ; SI-NEXT:    v_alignbit_b32 v13, v25, v21, 16
@@ -20447,8 +20467,8 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v32
 ; SI-NEXT:    s_cbranch_execnz .LBB51_3
 ; SI-NEXT:  .LBB51_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s15, s15, 3
-; SI-NEXT:    s_and_b32 s4, s15, 0xff
+; SI-NEXT:    s_add_i32 s40, s40, 3
+; SI-NEXT:    s_and_b32 s4, s40, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s14, 8
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v20
 ; SI-NEXT:    s_or_b32 s4, s5, s4
@@ -20472,7 +20492,7 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    s_add_i32 s13, s13, 3
 ; SI-NEXT:    v_add_i32_e32 v18, vcc, 0x3000000, v1
 ; SI-NEXT:    s_and_b32 s4, s13, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s11, 8
+; SI-NEXT:    s_lshl_b32 s5, s12, 8
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v12
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
@@ -20517,7 +20537,7 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    s_or_b32 s5, s5, s8
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s28, s28, 3
-; SI-NEXT:    s_add_i32 s40, s4, 0x3000000
+; SI-NEXT:    s_add_i32 s15, s4, 0x3000000
 ; SI-NEXT:    s_and_b32 s4, s28, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s29, 8
 ; SI-NEXT:    s_add_i32 s6, s6, 3
@@ -20567,24 +20587,24 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    v_add_i32_e32 v10, vcc, 0x3000000, v0
-; SI-NEXT:    s_add_i32 s12, s4, 0x3000000
+; SI-NEXT:    s_add_i32 s11, s4, 0x3000000
 ; SI-NEXT:    v_mov_b32_e32 v0, s41
-; SI-NEXT:    v_alignbit_b32 v1, s12, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s40
+; SI-NEXT:    v_alignbit_b32 v1, s11, v0, 16
+; SI-NEXT:    v_mov_b32_e32 v0, s15
 ; SI-NEXT:    v_alignbit_b32 v5, s10, v0, 16
 ; SI-NEXT:    v_alignbit_b32 v9, v10, v23, 16
 ; SI-NEXT:    v_alignbit_b32 v13, v25, v26, 16
 ; SI-NEXT:    v_alignbit_b32 v17, v18, v21, 16
-; SI-NEXT:    s_lshr_b32 s42, s12, 16
+; SI-NEXT:    s_lshr_b32 s42, s11, 16
 ; SI-NEXT:    s_lshr_b32 s43, s10, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
 ; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v25
 ; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
 ; SI-NEXT:  .LBB51_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s41
-; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    v_mov_b32_e32 v2, s11
 ; SI-NEXT:    v_mov_b32_e32 v3, s42
-; SI-NEXT:    v_mov_b32_e32 v4, s40
+; SI-NEXT:    v_mov_b32_e32 v4, s15
 ; SI-NEXT:    v_mov_b32_e32 v6, s10
 ; SI-NEXT:    v_mov_b32_e32 v7, s43
 ; SI-NEXT:    v_mov_b32_e32 v8, v23
@@ -20595,9 +20615,9 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
 ; SI-NEXT:  .LBB51_4:
 ; SI-NEXT:    ; implicit-def: $sgpr41
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr11
 ; SI-NEXT:    ; implicit-def: $sgpr42
-; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr15
 ; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $sgpr10
 ; SI-NEXT:    ; implicit-def: $sgpr43
@@ -38783,7 +38803,7 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s22, 0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s61, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s63, 0
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB79_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s14, s21, 24
@@ -38809,14 +38829,14 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s58, s1, 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s1, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s1, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s0, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s0, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s0, 8
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[4:5], s[20:21], 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[6:7], s[18:19], 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[8:9], s[16:17], 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[10:11], s[2:3], 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[12:13], s[0:1], 24
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s61
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s63
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB79_3
 ; GFX11-TRUE16-NEXT:  .LBB79_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_u32 s0, s0, 3
@@ -38857,12 +38877,12 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s58, s1, 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s1, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s1, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s0, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s0, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s0, 8
 ; GFX11-TRUE16-NEXT:  .LBB79_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s63
-; GFX11-TRUE16-NEXT:    s_mov_b32 s7, s62
+; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s62
+; GFX11-TRUE16-NEXT:    s_mov_b32 s7, s61
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s12
 ; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s5, 8
@@ -38998,8 +39018,8 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in
 ; GFX11-TRUE16-NEXT:    scratch_store_b64 v0, v[9:10], off offset:32
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB79_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr63_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr62_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr61_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr12_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr60_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr59_lo16
@@ -39034,7 +39054,7 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s22, 0
-; GFX11-FAKE16-NEXT:    s_mov_b32 s61, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s63, 0
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB79_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s14, s21, 24
@@ -39060,14 +39080,14 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s58, s1, 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s59, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s60, s1, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s62, s0, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s63, s0, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s61, s0, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s62, s0, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[4:5], s[20:21], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[6:7], s[18:19], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[8:9], s[16:17], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[10:11], s[2:3], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[12:13], s[0:1], 24
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s61
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s63
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB79_3
 ; GFX11-FAKE16-NEXT:  .LBB79_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    s_add_u32 s0, s0, 3
@@ -39108,12 +39128,12 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s58, s1, 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s59, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s60, s1, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s62, s0, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s63, s0, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s61, s0, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s62, s0, 8
 ; GFX11-FAKE16-NEXT:  .LBB79_3: ; %end
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s63, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s62, 0xff
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s62, 8
+; GFX11-FAKE16-NEXT:    s_and_b32 s7, s61, 0xff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s12, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s5
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, s7, s9
@@ -39213,8 +39233,8 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in
 ; GFX11-FAKE16-NEXT:    scratch_store_b64 v0, v[9:10], off offset:32
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-FAKE16-NEXT:  .LBB79_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr63
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr62
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr61
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr12
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr60
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr59

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
index fbdee6eea327e..7b756bce857bc 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
@@ -6112,8 +6112,10 @@ define <4 x i8> @bitcast_v2i16_to_v4i8(<2 x i16> %a, i32 %b) {
 ; VI-LABEL: bitcast_v2i16_to_v4i8:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v4, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
+; VI-NEXT:    ; implicit-def: $vgpr0
 ; VI-NEXT:    ; implicit-def: $vgpr1
 ; VI-NEXT:    ; implicit-def: $vgpr3
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -6126,19 +6128,20 @@ define <4 x i8> @bitcast_v2i16_to_v4i8(<2 x i16> %a, i32 %b) {
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ; VI-NEXT:  .LBB56_3: ; %cmp.false
-; VI-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v3, 24, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v4
+; VI-NEXT:    v_mov_b32_e32 v0, v4
+; VI-NEXT:    ; implicit-def: $vgpr4
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB56_2
 ; VI-NEXT:  .LBB56_4: ; %cmp.true
 ; VI-NEXT:    v_mov_b32_e32 v1, 3
-; VI-NEXT:    v_add_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v4, 3, v0
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; VI-NEXT:    v_or_b32_e32 v0, v4, v0
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; VI-NEXT:    v_add_u16_sdwa v2, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_e32 v0, 3, v4
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; VI-NEXT:    v_or_b32_e32 v1, v0, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; VI-NEXT:    v_bfe_u32 v3, v2, 8, 8
-; VI-NEXT:    v_mov_b32_e32 v0, v4
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6289,30 +6292,31 @@ define inreg <4 x i8> @bitcast_v2i16_to_v4i8_scalar(<2 x i16> inreg %a, i32 inre
 ; VI-NEXT:    s_cmp_lg_u32 s17, 0
 ; VI-NEXT:    s_cbranch_scc0 .LBB57_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
-; VI-NEXT:    s_lshr_b32 s7, s16, 24
+; VI-NEXT:    s_lshr_b32 s8, s16, 24
 ; VI-NEXT:    s_lshr_b32 s6, s16, 16
-; VI-NEXT:    s_lshr_b32 s8, s16, 8
+; VI-NEXT:    s_lshr_b32 s9, s16, 8
+; VI-NEXT:    s_mov_b32 s7, s16
 ; VI-NEXT:    s_cbranch_execnz .LBB57_3
 ; VI-NEXT:  .LBB57_2: ; %cmp.true
-; VI-NEXT:    s_lshr_b32 s6, s16, 16
-; VI-NEXT:    s_add_i32 s4, s16, 3
-; VI-NEXT:    s_add_i32 s6, s6, 3
-; VI-NEXT:    s_and_b32 s5, s4, 0xffff
-; VI-NEXT:    s_lshl_b32 s7, s6, 16
-; VI-NEXT:    s_or_b32 s5, s5, s7
-; VI-NEXT:    s_mov_b32 s16, s4
-; VI-NEXT:    s_lshr_b32 s8, s5, 8
-; VI-NEXT:    s_bfe_u32 s7, s6, 0x80008
+; VI-NEXT:    s_lshr_b32 s5, s16, 16
+; VI-NEXT:    s_add_i32 s7, s16, 3
+; VI-NEXT:    s_add_i32 s6, s5, 3
+; VI-NEXT:    s_and_b32 s4, s7, 0xffff
+; VI-NEXT:    s_lshl_b32 s5, s6, 16
+; VI-NEXT:    s_or_b32 s4, s4, s5
+; VI-NEXT:    s_lshr_b32 s9, s4, 8
+; VI-NEXT:    s_bfe_u32 s8, s6, 0x80008
 ; VI-NEXT:  .LBB57_3: ; %end
-; VI-NEXT:    v_mov_b32_e32 v0, s16
-; VI-NEXT:    v_mov_b32_e32 v1, s8
+; VI-NEXT:    v_mov_b32_e32 v0, s7
+; VI-NEXT:    v_mov_b32_e32 v1, s9
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
-; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    v_mov_b32_e32 v3, s8
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ; VI-NEXT:  .LBB57_4:
-; VI-NEXT:    ; implicit-def: $sgpr8
-; VI-NEXT:    ; implicit-def: $sgpr6
 ; VI-NEXT:    ; implicit-def: $sgpr7
+; VI-NEXT:    ; implicit-def: $sgpr9
+; VI-NEXT:    ; implicit-def: $sgpr6
+; VI-NEXT:    ; implicit-def: $sgpr8
 ; VI-NEXT:    s_branch .LBB57_2
 ;
 ; GFX9-LABEL: bitcast_v2i16_to_v4i8_scalar:
@@ -9170,8 +9174,8 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v4
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v4.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v4.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4
 ; GFX11-TRUE16-NEXT:  ; %bb.2: ; %Flow
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
@@ -9200,7 +9204,6 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; GFX11-TRUE16-NEXT:  .LBB76_4: ; %end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index 68498462f489e..1024c2a7f066a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -64649,10 +64649,11 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v63, 16, v2
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; kill: killed $vgpr2
@@ -64727,10 +64728,9 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; kill: killed $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr58
-; SI-NEXT:    v_lshlrev_b32_e32 v62, 16, v4
+; SI-NEXT:    ; implicit-def: $vgpr62
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v36, 16, v8
 ; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v10
@@ -64746,8 +64746,8 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v43, 16, v30
 ; SI-NEXT:    ; kill: killed $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr58
-; SI-NEXT:    ; implicit-def: $vgpr58
+; SI-NEXT:    ; kill: killed $vgpr62
+; SI-NEXT:    ; implicit-def: $vgpr62
 ; SI-NEXT:    ; implicit-def: $vgpr56
 ; SI-NEXT:    ; implicit-def: $vgpr47
 ; SI-NEXT:    ; implicit-def: $vgpr50
@@ -64758,10 +64758,11 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr30
 ; SI-NEXT:    ; implicit-def: $vgpr26
 ; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr61
 ; SI-NEXT:    ; implicit-def: $vgpr60
 ; SI-NEXT:    ; implicit-def: $vgpr18
 ; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr58
 ; SI-NEXT:    ; implicit-def: $vgpr57
 ; SI-NEXT:    ; implicit-def: $vgpr10
 ; SI-NEXT:    ; kill: killed $vgpr2
@@ -64770,8 +64771,8 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr46
 ; SI-NEXT:    ; implicit-def: $vgpr45
 ; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; kill: killed $vgpr58
-; SI-NEXT:    ; implicit-def: $vgpr58
+; SI-NEXT:    ; kill: killed $vgpr62
+; SI-NEXT:    ; implicit-def: $vgpr62
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; SI-NEXT:    s_waitcnt vmcnt(1)
@@ -64784,204 +64785,207 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v56, v1, v63
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v47, v1, v62
+; SI-NEXT:    v_or_b32_e32 v47, v1, v27
 ; SI-NEXT:    v_alignbit_b32 v1, v47, v56, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v47, v56, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v47, v56, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v5
 ; SI-NEXT:    v_or_b32_e32 v50, v1, v37
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v7
 ; SI-NEXT:    v_or_b32_e32 v38, v1, v36
 ; SI-NEXT:    v_alignbit_b32 v1, v38, v50, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v38, v50, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v38, v50, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v9
 ; SI-NEXT:    v_or_b32_e32 v34, v1, v48
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v11
 ; SI-NEXT:    v_or_b32_e32 v33, v1, v39
 ; SI-NEXT:    v_alignbit_b32 v1, v33, v34, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v33, v34, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v33, v34, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v13
 ; SI-NEXT:    v_or_b32_e32 v32, v1, v51
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v15
 ; SI-NEXT:    v_or_b32_e32 v31, v1, v49
 ; SI-NEXT:    v_alignbit_b32 v1, v31, v32, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v31, v32, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v31, v32, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v17
 ; SI-NEXT:    v_or_b32_e32 v30, v1, v53
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v19
 ; SI-NEXT:    v_or_b32_e32 v26, v1, v52
 ; SI-NEXT:    v_alignbit_b32 v1, v26, v30, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v26, v30, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v26, v30, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v21
 ; SI-NEXT:    v_or_b32_e32 v22, v1, v55
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v23
 ; SI-NEXT:    v_or_b32_e32 v18, v1, v54
-; SI-NEXT:    v_alignbit_b32 v1, v18, v22, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v18, v22, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v25
 ; SI-NEXT:    v_or_b32_e32 v14, v1, v41
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v27
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v60, v18, v22, 24
+; SI-NEXT:    v_alignbit_b32 v61, v18, v22, 16
+; SI-NEXT:    v_bfe_u32 v62, v44, 8, 8
+; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $vgpr15
+; SI-NEXT:    ; implicit-def: $vgpr17
+; SI-NEXT:    ; implicit-def: $vgpr19
+; SI-NEXT:    ; implicit-def: $vgpr21
+; SI-NEXT:    ; implicit-def: $vgpr23
+; SI-NEXT:    ; implicit-def: $vgpr25
+; SI-NEXT:    ; implicit-def: $vgpr63
+; SI-NEXT:    ; implicit-def: $vgpr27
+; SI-NEXT:    ; implicit-def: $vgpr37
+; SI-NEXT:    ; implicit-def: $vgpr36
+; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; implicit-def: $vgpr39
+; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    ; implicit-def: $vgpr53
+; SI-NEXT:    ; implicit-def: $vgpr52
+; SI-NEXT:    ; implicit-def: $vgpr55
+; SI-NEXT:    ; implicit-def: $vgpr54
+; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v10, v1, v40
 ; SI-NEXT:    v_alignbit_b32 v1, v10, v14, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v29
 ; SI-NEXT:    v_or_b32_e32 v6, v1, v43
-; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v35
 ; SI-NEXT:    v_or_b32_e32 v2, v1, v42
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v47
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v33
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v31
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v12
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v20
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v28
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v44
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_bfe_u32 v1, v4, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_bfe_u32 v1, v8, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_bfe_u32 v1, v12, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_bfe_u32 v1, v16, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_bfe_u32 v1, v20, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_bfe_u32 v1, v24, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_bfe_u32 v1, v28, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v26
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
-; SI-NEXT:    v_alignbit_b32 v60, v18, v22, 24
 ; SI-NEXT:    v_alignbit_b32 v57, v10, v14, 24
-; SI-NEXT:    v_alignbit_b32 v61, v10, v14, 16
+; SI-NEXT:    v_alignbit_b32 v58, v10, v14, 16
 ; SI-NEXT:    v_alignbit_b32 v45, v2, v6, 24
 ; SI-NEXT:    v_alignbit_b32 v46, v2, v6, 16
 ; SI-NEXT:    v_alignbit_b32 v59, v2, v6, 8
-; SI-NEXT:    v_bfe_u32 v58, v44, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr27
+; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; kill: killed $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr29
 ; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr63
-; SI-NEXT:    ; implicit-def: $vgpr62
-; SI-NEXT:    ; implicit-def: $vgpr37
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr48
-; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    ; implicit-def: $vgpr51
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; implicit-def: $vgpr53
-; SI-NEXT:    ; implicit-def: $vgpr52
-; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    ; implicit-def: $vgpr54
-; SI-NEXT:    ; implicit-def: $vgpr41
 ; SI-NEXT:    ; implicit-def: $vgpr40
 ; SI-NEXT:    ; implicit-def: $vgpr43
 ; SI-NEXT:    ; implicit-def: $vgpr42
 ; SI-NEXT:  .LBB96_2: ; %Flow
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], s[4:5]
-; SI-NEXT:    v_mov_b32_e32 v20, v61
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
 ; SI-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB96_4
 ; SI-NEXT:  ; %bb.3: ; %cmp.true
@@ -64991,7 +64995,35 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; SI-NEXT:    s_mov_b32 s6, 0x30000
 ; SI-NEXT:    v_or_b32_e32 v4, v41, v4
 ; SI-NEXT:    v_add_i32_e32 v14, vcc, s6, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v27
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v63, v1
+; SI-NEXT:    v_add_i32_e32 v56, vcc, s6, v1
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v3
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v27, v1
+; SI-NEXT:    v_add_i32_e32 v47, vcc, s6, v1
+; SI-NEXT:    v_alignbit_b32 v1, v47, v56, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v1, v47, v56, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v29
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_or_b32_e32 v2, v43, v2
+; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x30000, v2
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v35
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_or_b32_e32 v2, v42, v2
+; SI-NEXT:    v_add_i32_e32 v2, vcc, s6, v2
+; SI-NEXT:    v_alignbit_b32 v45, v2, v6, 24
+; SI-NEXT:    v_alignbit_b32 v46, v2, v6, 16
+; SI-NEXT:    v_alignbit_b32 v59, v2, v6, 8
+; SI-NEXT:    v_lshrrev_b32_e32 v62, 24, v2
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v4
 ; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; SI-NEXT:    v_or_b32_e32 v4, v40, v4
 ; SI-NEXT:    v_add_i32_e32 v10, vcc, s6, v4
@@ -65025,157 +65057,121 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_add_i32_e32 v34, vcc, s6, v4
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v11
 ; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v1
 ; SI-NEXT:    v_or_b32_e32 v4, v39, v4
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_add_i32_e32 v33, vcc, s6, v4
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v5
-; SI-NEXT:    v_or_b32_e32 v1, v63, v1
 ; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT:    v_add_i32_e32 v56, vcc, s6, v1
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v3
 ; SI-NEXT:    v_or_b32_e32 v4, v37, v4
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_add_i32_e32 v50, vcc, s6, v4
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v7
-; SI-NEXT:    v_or_b32_e32 v1, v62, v1
 ; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT:    v_add_i32_e32 v47, vcc, s6, v1
 ; SI-NEXT:    v_or_b32_e32 v4, v36, v4
-; SI-NEXT:    v_alignbit_b32 v1, v47, v56, 16
 ; SI-NEXT:    v_add_i32_e32 v38, vcc, s6, v4
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v47, v56, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v38, v50, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v38, v50, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v38, v50, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v33, v34, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v33, v34, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v33, v34, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v31, v32, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v31, v32, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v31, v32, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v26, v30, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v26, v30, 8
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v18, v22, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v31, v32, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 24, v47
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v47
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v47
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 24, v38
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v38
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 24, v33
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v33
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v33
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 24, v31
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v31
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v31
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 24, v26
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v26
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v29
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v26
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 24, v18
-; SI-NEXT:    v_or_b32_e32 v2, v43, v2
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v18
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x30000, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v35
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 24, v10
-; SI-NEXT:    v_or_b32_e32 v2, v42, v2
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v10
-; SI-NEXT:    v_add_i32_e32 v2, vcc, s6, v2
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; SI-NEXT:    v_alignbit_b32 v4, v47, v56, 24
-; SI-NEXT:    v_alignbit_b32 v44, v26, v30, 24
+; SI-NEXT:    v_alignbit_b32 v24, v31, v32, 24
+; SI-NEXT:    v_alignbit_b32 v28, v31, v32, 16
+; SI-NEXT:    v_alignbit_b32 v12, v26, v30, 24
+; SI-NEXT:    v_alignbit_b32 v16, v26, v30, 16
+; SI-NEXT:    v_alignbit_b32 v44, v26, v30, 8
 ; SI-NEXT:    v_alignbit_b32 v60, v18, v22, 24
 ; SI-NEXT:    v_alignbit_b32 v61, v18, v22, 16
+; SI-NEXT:    v_alignbit_b32 v20, v18, v22, 8
 ; SI-NEXT:    v_alignbit_b32 v57, v10, v14, 24
-; SI-NEXT:    v_alignbit_b32 v20, v10, v14, 16
-; SI-NEXT:    v_alignbit_b32 v28, v10, v14, 8
-; SI-NEXT:    v_alignbit_b32 v45, v2, v6, 24
-; SI-NEXT:    v_alignbit_b32 v46, v2, v6, 16
-; SI-NEXT:    v_alignbit_b32 v59, v2, v6, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v58, 24, v2
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v58, v10, v14, 16
+; SI-NEXT:    v_alignbit_b32 v8, v10, v14, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; SI-NEXT:  .LBB96_4: ; %end
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v56
 ; SI-NEXT:    s_waitcnt vmcnt(1)
@@ -65183,7 +65179,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
@@ -65191,14 +65187,14 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v3, v4, v3
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v47
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -65209,14 +65205,14 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v50
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
@@ -65227,14 +65223,14 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 8, v0
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v38
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -65245,14 +65241,14 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v34
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
@@ -65263,14 +65259,14 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v33
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -65281,32 +65277,28 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v32
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v24
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v28
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v3, v4, v3
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 24, v0
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v31
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -65317,30 +65309,26 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v30
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v44
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v30
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v44
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v16
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v12
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v3, v4, v3
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v26
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
@@ -65351,28 +65339,26 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 36, v0
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v60
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v20
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_and_b32_e32 v3, 0xff, v61
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v60
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v3, v4, v3
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 40, v0
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v18
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
@@ -65385,9 +65371,9 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v8
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v20
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v58
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v57
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -65395,14 +65381,14 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v10
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
@@ -65427,13 +65413,13 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v58
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v62
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
@@ -65462,27 +65448,28 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ;
 ; VI-LABEL: bitcast_v32i16_to_v64i8:
 ; VI:       ; %bb.0:
+; VI-NEXT:    ; implicit-def: $vgpr19
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v17
 ; VI-NEXT:    ; implicit-def: $vgpr17
-; VI-NEXT:    v_lshrrev_b32_e32 v29, 16, v16
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v15
-; VI-NEXT:    v_lshrrev_b32_e32 v30, 16, v14
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
-; VI-NEXT:    v_lshrrev_b32_e32 v31, 16, v12
-; VI-NEXT:    v_lshrrev_b32_e32 v36, 16, v11
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v10
-; VI-NEXT:    v_lshrrev_b32_e32 v38, 16, v9
-; VI-NEXT:    v_lshrrev_b32_e32 v35, 16, v8
-; VI-NEXT:    v_lshrrev_b32_e32 v39, 16, v7
-; VI-NEXT:    v_lshrrev_b32_e32 v37, 16, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v49, 16, v5
-; VI-NEXT:    v_lshrrev_b32_e32 v48, 16, v4
-; VI-NEXT:    v_lshrrev_b32_e32 v51, 16, v3
-; VI-NEXT:    v_lshrrev_b32_e32 v50, 16, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v52, 16, v1
+; VI-NEXT:    ; kill: killed $vgpr19
+; VI-NEXT:    ; implicit-def: $vgpr19
+; VI-NEXT:    ; kill: killed $vgpr17
+; VI-NEXT:    ; implicit-def: $vgpr17
+; VI-NEXT:    ; kill: killed $vgpr19
+; VI-NEXT:    ; implicit-def: $vgpr19
+; VI-NEXT:    ; kill: killed $vgpr17
+; VI-NEXT:    ; implicit-def: $vgpr17
+; VI-NEXT:    ; kill: killed $vgpr19
+; VI-NEXT:    ; implicit-def: $vgpr19
+; VI-NEXT:    ; kill: killed $vgpr17
+; VI-NEXT:    ; implicit-def: $vgpr17
+; VI-NEXT:    ; kill: killed $vgpr19
+; VI-NEXT:    ; implicit-def: $vgpr19
 ; VI-NEXT:    ; kill: killed $vgpr17
 ; VI-NEXT:    ; implicit-def: $vgpr17
+; VI-NEXT:    ; kill: killed $vgpr19
+; VI-NEXT:    ; implicit-def: $vgpr19
 ; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -65499,312 +65486,415 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr26
-; VI-NEXT:    ; implicit-def: $vgpr23
-; VI-NEXT:    ; implicit-def: $vgpr53
+; VI-NEXT:    ; implicit-def: $vgpr43
+; VI-NEXT:    ; kill: killed $vgpr17
+; VI-NEXT:    ; implicit-def: $vgpr17
+; VI-NEXT:    ; kill: killed $vgpr19
+; VI-NEXT:    ; implicit-def: $vgpr19
+; VI-NEXT:    ; implicit-def: $vgpr42
+; VI-NEXT:    v_lshrrev_b32_e32 v26, 16, v16
+; VI-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
+; VI-NEXT:    v_lshrrev_b32_e32 v27, 16, v14
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 16, v13
+; VI-NEXT:    v_lshrrev_b32_e32 v28, 16, v12
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v11
+; VI-NEXT:    v_lshrrev_b32_e32 v30, 16, v10
+; VI-NEXT:    v_lshrrev_b32_e32 v35, 16, v9
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v8
+; VI-NEXT:    v_lshrrev_b32_e32 v37, 16, v7
+; VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v6
+; VI-NEXT:    v_lshrrev_b32_e32 v39, 16, v5
+; VI-NEXT:    v_lshrrev_b32_e32 v36, 16, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v48, 16, v3
+; VI-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
+; VI-NEXT:    v_lshrrev_b32_e32 v49, 16, v1
+; VI-NEXT:    ; kill: killed $vgpr17
+; VI-NEXT:    ; implicit-def: $vgpr17
+; VI-NEXT:    ; kill: killed $vgpr19
+; VI-NEXT:    ; implicit-def: $vgpr19
+; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr42
+; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT:    ; implicit-def: $vgpr41
+; VI-NEXT:    ; implicit-def: $vgpr18
 ; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr62
-; VI-NEXT:    ; implicit-def: $vgpr22
-; VI-NEXT:    ; implicit-def: $vgpr61
-; VI-NEXT:    ; implicit-def: $vgpr63
+; VI-NEXT:    ; implicit-def: $vgpr45
 ; VI-NEXT:    ; implicit-def: $vgpr59
+; VI-NEXT:    ; implicit-def: $vgpr57
+; VI-NEXT:    ; implicit-def: $vgpr51
 ; VI-NEXT:    ; implicit-def: $vgpr47
-; VI-NEXT:    ; implicit-def: $vgpr60
+; VI-NEXT:    ; implicit-def: $vgpr55
 ; VI-NEXT:    ; implicit-def: $vgpr46
-; VI-NEXT:    ; implicit-def: $vgpr45
-; VI-NEXT:    ; implicit-def: $vgpr58
+; VI-NEXT:    ; implicit-def: $vgpr61
+; VI-NEXT:    ; implicit-def: $vgpr60
+; VI-NEXT:    ; implicit-def: $vgpr52
+; VI-NEXT:    ; implicit-def: $vgpr63
+; VI-NEXT:    ; implicit-def: $vgpr40
+; VI-NEXT:    ; implicit-def: $vgpr53
+; VI-NEXT:    ; kill: killed $vgpr17
+; VI-NEXT:    ; implicit-def: $vgpr17
 ; VI-NEXT:    ; implicit-def: $vgpr44
-; VI-NEXT:    ; implicit-def: $vgpr43
-; VI-NEXT:    ; implicit-def: $vgpr57
-; VI-NEXT:    ; implicit-def: $vgpr42
-; VI-NEXT:    ; implicit-def: $vgpr41
+; VI-NEXT:    ; implicit-def: $vgpr58
 ; VI-NEXT:    ; implicit-def: $vgpr56
-; VI-NEXT:    ; kill: killed $vgpr17
-; VI-NEXT:    ; implicit-def: $vgpr28
-; VI-NEXT:    ; implicit-def: $vgpr40
-; VI-NEXT:    ; implicit-def: $vgpr55
-; VI-NEXT:    ; implicit-def: $vgpr27
+; VI-NEXT:    ; implicit-def: $vgpr50
+; VI-NEXT:    ; implicit-def: $vgpr62
+; VI-NEXT:    ; kill: killed $vgpr19
 ; VI-NEXT:    ; implicit-def: $vgpr25
+; VI-NEXT:    ; implicit-def: $vgpr24
+; VI-NEXT:    ; implicit-def: $vgpr23
+; VI-NEXT:    ; implicit-def: $vgpr22
 ; VI-NEXT:    ; implicit-def: $vgpr21
 ; VI-NEXT:    ; implicit-def: $vgpr20
 ; VI-NEXT:    ; implicit-def: $vgpr19
-; VI-NEXT:    ; implicit-def: $vgpr18
-; VI-NEXT:    ; implicit-def: $vgpr17
+; VI-NEXT:    ; implicit-def: $vgpr42
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB96_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
+; VI-NEXT:    v_lshrrev_b32_e32 v17, 8, v16
+; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v17, 8, v15
+; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v17, 24, v14
+; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v17, 8, v14
-; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b32_e32 v17, 8, v13
-; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
-; VI-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v17, 24, v12
+; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v17, 8, v12
+; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v17, 8, v11
+; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
+; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v17, 8, v10
+; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v17, 24, v8
+; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v17, 8, v8
+; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v17, 8, v7
+; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v17, 8, v6
+; VI-NEXT:    v_lshrrev_b64 v[19:20], 24, v[15:16]
+; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v17, 8, v5
+; VI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b64 v[19:20], 24, v[13:14]
+; VI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
 ; VI-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
 ; VI-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
-; VI-NEXT:    v_lshrrev_b64 v[25:26], 24, v[5:6]
-; VI-NEXT:    v_lshrrev_b64 v[22:23], 24, v[3:4]
-; VI-NEXT:    v_lshrrev_b32_e32 v27, 24, v16
-; VI-NEXT:    v_lshrrev_b32_e32 v55, 8, v16
-; VI-NEXT:    v_lshrrev_b32_e32 v40, 8, v15
-; VI-NEXT:    v_lshrrev_b32_e32 v28, 24, v14
-; VI-NEXT:    v_lshrrev_b32_e32 v56, 24, v12
-; VI-NEXT:    v_lshrrev_b32_e32 v41, 8, v12
-; VI-NEXT:    v_lshrrev_b32_e32 v42, 8, v11
-; VI-NEXT:    v_lshrrev_b32_e32 v57, 24, v10
-; VI-NEXT:    v_lshrrev_b32_e32 v43, 8, v10
-; VI-NEXT:    v_lshrrev_b32_e32 v44, 8, v9
-; VI-NEXT:    v_lshrrev_b32_e32 v58, 24, v8
-; VI-NEXT:    v_lshrrev_b32_e32 v45, 8, v8
-; VI-NEXT:    v_lshrrev_b32_e32 v46, 8, v7
-; VI-NEXT:    v_lshrrev_b32_e32 v60, 24, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v47, 8, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v59, 8, v5
-; VI-NEXT:    v_lshrrev_b32_e32 v63, 24, v4
-; VI-NEXT:    v_lshrrev_b32_e32 v61, 8, v4
-; VI-NEXT:    v_lshrrev_b32_e32 v62, 8, v3
-; VI-NEXT:    v_lshrrev_b32_e32 v54, 24, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v53, 8, v2
-; VI-NEXT:    v_lshrrev_b64 v[23:24], 24, v[1:2]
-; VI-NEXT:    v_lshrrev_b32_e32 v26, 8, v1
+; VI-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; VI-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; VI-NEXT:    v_lshrrev_b32_e32 v50, 24, v16
+; VI-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; VI-NEXT:    v_lshrrev_b32_e32 v42, 8, v9
+; VI-NEXT:    v_lshrrev_b32_e32 v43, 24, v6
+; VI-NEXT:    v_lshrrev_b32_e32 v46, 24, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v55, 8, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 8, v3
+; VI-NEXT:    v_lshrrev_b32_e32 v59, 24, v2
+; VI-NEXT:    v_lshrrev_b32_e32 v45, 8, v2
+; VI-NEXT:    v_lshrrev_b32_e32 v18, 8, v1
+; VI-NEXT:    v_mov_b32_e32 v25, v50
+; VI-NEXT:    v_mov_b32_e32 v41, v1
+; VI-NEXT:    v_mov_b32_e32 v54, v2
+; VI-NEXT:    v_mov_b32_e32 v57, v3
+; VI-NEXT:    v_mov_b32_e32 v47, v4
+; VI-NEXT:    v_mov_b32_e32 v61, v5
+; VI-NEXT:    v_mov_b32_e32 v60, v6
+; VI-NEXT:    v_mov_b32_e32 v52, v7
+; VI-NEXT:    v_mov_b32_e32 v63, v8
+; VI-NEXT:    v_mov_b32_e32 v40, v9
+; VI-NEXT:    v_mov_b32_e32 v53, v10
+; VI-NEXT:    v_mov_b32_e32 v17, v11
+; VI-NEXT:    v_mov_b32_e32 v44, v12
+; VI-NEXT:    v_mov_b32_e32 v58, v13
+; VI-NEXT:    v_mov_b32_e32 v56, v14
+; VI-NEXT:    v_mov_b32_e32 v50, v15
+; VI-NEXT:    v_mov_b32_e32 v62, v16
+; VI-NEXT:    ; implicit-def: $vgpr1
+; VI-NEXT:    ; implicit-def: $vgpr3
+; VI-NEXT:    ; implicit-def: $vgpr5
+; VI-NEXT:    ; implicit-def: $vgpr7
+; VI-NEXT:    ; implicit-def: $vgpr9
+; VI-NEXT:    ; implicit-def: $vgpr11
+; VI-NEXT:    ; implicit-def: $vgpr13
+; VI-NEXT:    ; implicit-def: $vgpr15
 ; VI-NEXT:  .LBB96_2: ; %Flow
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB96_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
-; VI-NEXT:    v_mov_b32_e32 v17, 3
-; VI-NEXT:    v_add_u16_sdwa v50, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v53, 3, v2
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v50
-; VI-NEXT:    v_add_u16_sdwa v52, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v28, v53, v2
-; VI-NEXT:    v_add_u16_e32 v2, 3, v1
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v52
-; VI-NEXT:    v_add_u16_sdwa v48, v4, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v27, v2, v1
-; VI-NEXT:    v_add_u16_e32 v54, 3, v4
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v48
-; VI-NEXT:    v_add_u16_sdwa v51, v3, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v24, v54, v1
-; VI-NEXT:    v_add_u16_e32 v4, 3, v3
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v51
-; VI-NEXT:    v_add_u16_sdwa v37, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v23, v4, v1
-; VI-NEXT:    v_add_u16_e32 v56, 3, v6
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v37
-; VI-NEXT:    v_add_u16_sdwa v49, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v26, v56, v1
-; VI-NEXT:    v_add_u16_e32 v6, 3, v5
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v49
-; VI-NEXT:    v_add_u16_sdwa v35, v8, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v25, v6, v1
-; VI-NEXT:    v_add_u16_e32 v57, 3, v8
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v35
-; VI-NEXT:    v_add_u16_sdwa v39, v7, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v47, v57, v1
-; VI-NEXT:    v_add_u16_e32 v8, 3, v7
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v39
-; VI-NEXT:    v_add_u16_sdwa v33, v10, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v46, v8, v1
-; VI-NEXT:    v_add_u16_e32 v58, 3, v10
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v33
-; VI-NEXT:    v_add_u16_sdwa v38, v9, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v10, v58, v1
-; VI-NEXT:    v_add_u16_e32 v59, 3, v9
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v38
-; VI-NEXT:    v_add_u16_sdwa v31, v12, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v9, v59, v1
-; VI-NEXT:    v_add_u16_e32 v60, 3, v12
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v31
-; VI-NEXT:    v_add_u16_sdwa v36, v11, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v12, v60, v1
-; VI-NEXT:    v_add_u16_e32 v61, 3, v11
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v36
-; VI-NEXT:    v_add_u16_sdwa v30, v14, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v11, v61, v1
-; VI-NEXT:    v_add_u16_e32 v62, 3, v14
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v30
-; VI-NEXT:    v_add_u16_sdwa v34, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v14, v62, v1
-; VI-NEXT:    v_add_u16_e32 v63, 3, v13
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v34
-; VI-NEXT:    v_add_u16_sdwa v29, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v13, v63, v1
-; VI-NEXT:    v_add_u16_e32 v55, 3, v16
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v29
-; VI-NEXT:    v_add_u16_sdwa v32, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v16, v55, v1
-; VI-NEXT:    v_add_u16_e32 v40, 3, v15
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v32
-; VI-NEXT:    v_or_b32_e32 v15, v40, v1
-; VI-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; VI-NEXT:    v_mov_b32_e32 v18, 3
+; VI-NEXT:    v_add_u16_sdwa v26, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v29, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_e32 v62, 3, v16
+; VI-NEXT:    v_lshlrev_b32_e32 v16, 16, v26
+; VI-NEXT:    v_add_u16_e32 v50, 3, v15
+; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v29
+; VI-NEXT:    v_or_b32_e32 v16, v62, v16
+; VI-NEXT:    v_or_b32_e32 v15, v50, v15
+; VI-NEXT:    v_add_u16_sdwa v38, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v49, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v36, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v48, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v34, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v39, v5, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v32, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v37, v7, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v30, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v35, v9, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v28, v12, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v33, v11, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v27, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v31, v13, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_lshrrev_b64 v[18:19], 24, v[15:16]
+; VI-NEXT:    v_add_u16_e32 v56, 3, v14
+; VI-NEXT:    v_lshlrev_b32_e32 v14, 16, v27
+; VI-NEXT:    v_add_u16_e32 v58, 3, v13
+; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v31
+; VI-NEXT:    v_or_b32_e32 v14, v56, v14
+; VI-NEXT:    v_or_b32_e32 v13, v58, v13
+; VI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; VI-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; VI-NEXT:    v_add_u16_e32 v44, 3, v12
+; VI-NEXT:    v_lshlrev_b32_e32 v12, 16, v28
+; VI-NEXT:    v_add_u16_e32 v17, 3, v11
+; VI-NEXT:    v_lshlrev_b32_e32 v11, 16, v33
+; VI-NEXT:    v_add_u16_e32 v53, 3, v10
+; VI-NEXT:    v_lshlrev_b32_e32 v10, 16, v30
+; VI-NEXT:    v_add_u16_e32 v40, 3, v9
+; VI-NEXT:    v_lshlrev_b32_e32 v9, 16, v35
+; VI-NEXT:    v_or_b32_e32 v12, v44, v12
+; VI-NEXT:    v_or_b32_e32 v11, v17, v11
+; VI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT:    v_add_u16_e32 v63, 3, v8
+; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v32
+; VI-NEXT:    v_add_u16_e32 v52, 3, v7
+; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v37
+; VI-NEXT:    v_or_b32_e32 v10, v53, v10
+; VI-NEXT:    v_or_b32_e32 v9, v40, v9
 ; VI-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; VI-NEXT:    v_add_u16_e32 v60, 3, v6
+; VI-NEXT:    v_lshlrev_b32_e32 v6, 16, v34
+; VI-NEXT:    v_add_u16_e32 v61, 3, v5
+; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v39
+; VI-NEXT:    v_or_b32_e32 v8, v63, v8
+; VI-NEXT:    v_or_b32_e32 v7, v52, v7
 ; VI-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
-; VI-NEXT:    v_lshrrev_b64 v[21:22], 24, v[46:47]
-; VI-NEXT:    v_lshrrev_b32_e32 v22, 8, v16
-; VI-NEXT:    v_lshrrev_b32_e32 v16, 8, v15
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v14
-; VI-NEXT:    v_mov_b32_e32 v14, v62
-; VI-NEXT:    v_mov_b32_e32 v15, v40
-; VI-NEXT:    v_mov_b32_e32 v40, v16
-; VI-NEXT:    v_mov_b32_e32 v16, v55
-; VI-NEXT:    v_mov_b32_e32 v55, v22
-; VI-NEXT:    v_lshrrev_b32_e32 v62, 8, v23
-; VI-NEXT:    v_lshrrev_b64 v[22:23], 24, v[23:24]
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v13
-; VI-NEXT:    v_lshrrev_b32_e32 v42, 8, v11
-; VI-NEXT:    v_lshrrev_b32_e32 v44, 8, v9
-; VI-NEXT:    v_lshrrev_b32_e32 v45, 8, v47
-; VI-NEXT:    v_lshrrev_b32_e32 v47, 8, v26
-; VI-NEXT:    v_mov_b32_e32 v9, v59
-; VI-NEXT:    v_mov_b32_e32 v11, v61
-; VI-NEXT:    v_lshrrev_b32_e32 v59, 8, v25
-; VI-NEXT:    v_lshrrev_b64 v[25:26], 24, v[25:26]
-; VI-NEXT:    v_lshrrev_b32_e32 v61, 8, v24
-; VI-NEXT:    v_lshrrev_b64 v[23:24], 24, v[27:28]
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT:    v_lshrrev_b32_e32 v41, 8, v12
-; VI-NEXT:    v_lshrrev_b32_e32 v43, 8, v10
-; VI-NEXT:    v_lshrrev_b32_e32 v46, 8, v46
-; VI-NEXT:    v_mov_b32_e32 v1, v2
-; VI-NEXT:    v_mov_b32_e32 v2, v53
-; VI-NEXT:    v_mov_b32_e32 v3, v4
-; VI-NEXT:    v_mov_b32_e32 v4, v54
-; VI-NEXT:    v_mov_b32_e32 v5, v6
-; VI-NEXT:    v_mov_b32_e32 v6, v56
-; VI-NEXT:    v_mov_b32_e32 v7, v8
-; VI-NEXT:    v_mov_b32_e32 v8, v57
-; VI-NEXT:    v_mov_b32_e32 v10, v58
-; VI-NEXT:    v_mov_b32_e32 v12, v60
-; VI-NEXT:    v_mov_b32_e32 v13, v63
-; VI-NEXT:    v_lshrrev_b32_e32 v53, 8, v28
-; VI-NEXT:    v_lshrrev_b32_e32 v26, 8, v27
-; VI-NEXT:    v_bfe_u32 v27, v29, 8, 8
-; VI-NEXT:    v_bfe_u32 v28, v30, 8, 8
-; VI-NEXT:    v_bfe_u32 v56, v31, 8, 8
-; VI-NEXT:    v_bfe_u32 v57, v33, 8, 8
-; VI-NEXT:    v_bfe_u32 v58, v35, 8, 8
-; VI-NEXT:    v_bfe_u32 v60, v37, 8, 8
-; VI-NEXT:    v_bfe_u32 v63, v48, 8, 8
-; VI-NEXT:    v_bfe_u32 v54, v50, 8, 8
+; VI-NEXT:    v_add_u16_e32 v47, 3, v4
+; VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v36
+; VI-NEXT:    v_add_u16_e32 v57, 3, v3
+; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v48
+; VI-NEXT:    v_or_b32_e32 v6, v60, v6
+; VI-NEXT:    v_or_b32_e32 v5, v61, v5
+; VI-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; VI-NEXT:    v_add_u16_e32 v54, 3, v2
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v38
+; VI-NEXT:    v_add_u16_e32 v41, 3, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v49
+; VI-NEXT:    v_or_b32_e32 v4, v47, v4
+; VI-NEXT:    v_or_b32_e32 v3, v57, v3
+; VI-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; VI-NEXT:    v_or_b32_e32 v2, v54, v2
+; VI-NEXT:    v_or_b32_e32 v1, v41, v1
+; VI-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; VI-NEXT:    v_lshrrev_b32_e32 v18, 8, v1
+; VI-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; VI-NEXT:    v_bfe_u32 v1, v27, 8, 8
+; VI-NEXT:    v_lshrrev_b32_e32 v13, 8, v13
+; VI-NEXT:    v_lshrrev_b32_e32 v42, 8, v9
+; VI-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v1, v28, 8, 8
+; VI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v9, 8, v7
+; VI-NEXT:    v_lshrrev_b32_e32 v7, 8, v6
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v1, v30, 8, 8
+; VI-NEXT:    v_lshrrev_b32_e32 v16, 8, v16
+; VI-NEXT:    v_lshrrev_b32_e32 v15, 8, v15
+; VI-NEXT:    v_lshrrev_b32_e32 v14, 8, v14
+; VI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v13, 8, v11
+; VI-NEXT:    v_lshrrev_b32_e32 v11, 8, v10
+; VI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT:    v_lshrrev_b32_e32 v7, 8, v5
+; VI-NEXT:    v_lshrrev_b32_e32 v55, 8, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v51, 8, v3
+; VI-NEXT:    v_lshrrev_b32_e32 v45, 8, v2
+; VI-NEXT:    v_bfe_u32 v25, v26, 8, 8
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT:    v_bfe_u32 v1, v32, 8, 8
+; VI-NEXT:    v_bfe_u32 v43, v34, 8, 8
+; VI-NEXT:    v_bfe_u32 v46, v36, 8, 8
+; VI-NEXT:    v_bfe_u32 v59, v38, 8, 8
+; VI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; VI-NEXT:  .LBB96_4: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    v_lshlrev_b16_e32 v24, 8, v26
-; VI-NEXT:    v_lshlrev_b16_e32 v23, 8, v23
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v23, v52, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e32 v24, 8, v53
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v18
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v24
+; VI-NEXT:    v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v54
-; VI-NEXT:    v_or_b32_sdwa v2, v2, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v45
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v59
+; VI-NEXT:    v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v22
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v62
-; VI-NEXT:    v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v51
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v23
+; VI-NEXT:    v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v61
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v63
-; VI-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v55
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v46
+; VI-NEXT:    v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 12, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v59
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v25
-; VI-NEXT:    v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v22
+; VI-NEXT:    v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 16, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v47
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v60
-; VI-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v43
+; VI-NEXT:    v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v46
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v21
-; VI-NEXT:    v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 24, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v45
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v58
-; VI-NEXT:    v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT:    v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 28, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v44
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v42
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v20
-; VI-NEXT:    v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v43
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v57
-; VI-NEXT:    v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT:    v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 36, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v42
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v19
-; VI-NEXT:    v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 40, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v41
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v56
-; VI-NEXT:    v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v31, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT:    v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 44, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v18
-; VI-NEXT:    v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v31, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 48, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v28
-; VI-NEXT:    v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT:    v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v27, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 52, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v40
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v17
-; VI-NEXT:    v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v29, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 56, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v55
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v27
-; VI-NEXT:    v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v29, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v25
+; VI-NEXT:    v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 60, v0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; VI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -71551,7 +71641,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:72
 ; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:52
 ; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:68
-; SI-NEXT:    v_readfirstlane_b32 s14, v27
+; SI-NEXT:    v_readfirstlane_b32 s15, v27
 ; SI-NEXT:    v_readfirstlane_b32 s40, v26
 ; SI-NEXT:    v_readfirstlane_b32 s12, v19
 ; SI-NEXT:    v_readfirstlane_b32 s13, v18
@@ -71600,17 +71690,17 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    s_and_b32 s5, s22, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s15, s23, 24
+; SI-NEXT:    s_lshl_b32 s14, s23, 24
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s15, s5
+; SI-NEXT:    s_or_b32 s5, s14, s5
 ; SI-NEXT:    s_or_b32 s41, s4, s5
 ; SI-NEXT:    s_and_b32 s4, s18, 0xff
 ; SI-NEXT:    s_lshl_b32 s4, s4, 16
-; SI-NEXT:    s_lshl_b32 s15, s19, 24
-; SI-NEXT:    s_or_b32 s4, s15, s4
-; SI-NEXT:    s_and_b32 s15, s28, 0xff
+; SI-NEXT:    s_lshl_b32 s14, s19, 24
+; SI-NEXT:    s_or_b32 s4, s14, s4
+; SI-NEXT:    s_and_b32 s14, s28, 0xff
 ; SI-NEXT:    s_lshl_b32 s46, s29, 8
-; SI-NEXT:    s_or_b32 s15, s15, s46
+; SI-NEXT:    s_or_b32 s14, s14, s46
 ; SI-NEXT:    s_and_b32 s46, s6, 0xff
 ; SI-NEXT:    s_lshl_b32 s46, s46, 16
 ; SI-NEXT:    s_lshl_b32 s47, s7, 24
@@ -71699,7 +71789,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    v_or_b32_e32 v63, v59, v34
 ; SI-NEXT:    v_or_b32_e32 v39, s4, v25
 ; SI-NEXT:    s_and_b32 s4, s40, 0xff
-; SI-NEXT:    s_lshl_b32 s56, s14, 8
+; SI-NEXT:    s_lshl_b32 s56, s15, 8
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
 ; SI-NEXT:    v_or_b32_e32 v48, v32, v63
 ; SI-NEXT:    v_and_b32_e32 v32, 0xff, v57
@@ -71722,12 +71812,12 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    v_or_b32_e32 v33, s4, v33
 ; SI-NEXT:    s_and_b32 s4, s45, 0xff
 ; SI-NEXT:    s_lshl_b32 s56, s44, 8
-; SI-NEXT:    s_and_b32 s15, s15, 0xffff
+; SI-NEXT:    s_and_b32 s14, s14, 0xffff
 ; SI-NEXT:    s_or_b32 s4, s4, s56
-; SI-NEXT:    s_or_b32 s15, s15, s57
+; SI-NEXT:    s_or_b32 s14, s14, s57
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    v_alignbit_b32 v1, s41, v1, 16
-; SI-NEXT:    v_alignbit_b32 v5, s15, v5, 16
+; SI-NEXT:    v_alignbit_b32 v5, s14, v5, 16
 ; SI-NEXT:    v_alignbit_b32 v9, v10, v15, 16
 ; SI-NEXT:    v_alignbit_b32 v13, v36, v23, 16
 ; SI-NEXT:    v_alignbit_b32 v21, v38, v29, 16
@@ -71793,7 +71883,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    s_add_i32 s40, s40, 3
 ; SI-NEXT:    v_add_i32_e32 v26, vcc, 0x3000000, v1
 ; SI-NEXT:    s_and_b32 s4, s40, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s14, 8
+; SI-NEXT:    s_lshl_b32 s5, s15, 8
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v28
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
@@ -71897,7 +71987,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    s_or_b32 s5, s5, s6
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s16, s16, 3
-; SI-NEXT:    s_add_i32 s15, s4, 0x3000000
+; SI-NEXT:    s_add_i32 s14, s4, 0x3000000
 ; SI-NEXT:    s_and_b32 s4, s16, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s17, 8
 ; SI-NEXT:    s_add_i32 s18, s18, 3
@@ -71938,7 +72028,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    v_mov_b32_e32 v0, s47
 ; SI-NEXT:    v_alignbit_b32 v1, s41, v0, 16
 ; SI-NEXT:    v_mov_b32_e32 v0, s46
-; SI-NEXT:    v_alignbit_b32 v5, s15, v0, 16
+; SI-NEXT:    v_alignbit_b32 v5, s14, v0, 16
 ; SI-NEXT:    v_alignbit_b32 v9, v10, v35, 16
 ; SI-NEXT:    v_alignbit_b32 v13, v36, v37, 16
 ; SI-NEXT:    v_alignbit_b32 v17, v18, v39, 16
@@ -71946,7 +72036,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    v_alignbit_b32 v25, v26, v33, 16
 ; SI-NEXT:    v_alignbit_b32 v29, v48, v34, 16
 ; SI-NEXT:    s_lshr_b32 s56, s41, 16
-; SI-NEXT:    s_lshr_b32 s57, s15, 16
+; SI-NEXT:    s_lshr_b32 s57, s14, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
 ; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v36
 ; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
@@ -71975,7 +72065,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    v_mov_b32_e32 v2, s41
 ; SI-NEXT:    v_mov_b32_e32 v3, s56
 ; SI-NEXT:    v_mov_b32_e32 v4, s46
-; SI-NEXT:    v_mov_b32_e32 v6, s15
+; SI-NEXT:    v_mov_b32_e32 v6, s14
 ; SI-NEXT:    v_mov_b32_e32 v7, s57
 ; SI-NEXT:    v_mov_b32_e32 v8, v35
 ; SI-NEXT:    v_mov_b32_e32 v12, v37
@@ -71995,7 +72085,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    ; implicit-def: $sgpr56
 ; SI-NEXT:    ; implicit-def: $sgpr46
 ; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr15
+; SI-NEXT:    ; implicit-def: $sgpr14
 ; SI-NEXT:    ; implicit-def: $sgpr57
 ; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr9
@@ -72102,11 +72192,11 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:    s_waitcnt vmcnt(11)
 ; VI-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
 ; VI-NEXT:    s_waitcnt vmcnt(9)
-; VI-NEXT:    v_lshlrev_b32_e32 v22, 8, v39
+; VI-NEXT:    v_lshlrev_b32_e32 v38, 8, v39
 ; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_lshlrev_b32_e32 v51, 8, v48
 ; VI-NEXT:    s_waitcnt vmcnt(5)
-; VI-NEXT:    v_lshlrev_b32_e32 v38, 8, v49
+; VI-NEXT:    v_lshlrev_b32_e32 v22, 8, v49
 ; VI-NEXT:    s_cbranch_scc0 .LBB99_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    v_or_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -72165,10 +72255,10 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:    v_or_b32_sdwa v1, v43, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v45, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v45, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v47, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v57, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v32, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v3, s4, v0
@@ -72228,9 +72318,9 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v24
 ; VI-NEXT:    v_or_b32_sdwa v9, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v23
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v45
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v57
 ; VI-NEXT:    v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_or_b32_sdwa v2, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v22, vcc, 0x300, v3
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v55
 ; VI-NEXT:    v_or_b32_sdwa v8, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -72287,8 +72377,8 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:    s_addk_i32 s6, 0x300
 ; VI-NEXT:    s_addk_i32 s8, 0x300
 ; VI-NEXT:    s_addk_i32 s10, 0x300
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v57
 ; VI-NEXT:    v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v45
 ; VI-NEXT:    s_addk_i32 s4, 0x300
 ; VI-NEXT:    s_lshl_b32 s5, s5, 16
 ; VI-NEXT:    s_lshl_b32 s7, s7, 16
@@ -72296,8 +72386,8 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:    s_and_b32 s10, s10, 0xffff
 ; VI-NEXT:    s_and_b32 s8, s8, 0xffff
 ; VI-NEXT:    s_and_b32 s6, s6, 0xffff
-; VI-NEXT:    v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 0x300, v1
+; VI-NEXT:    v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    s_or_b32 s9, s9, s10
 ; VI-NEXT:    s_or_b32 s7, s7, s8
 ; VI-NEXT:    s_or_b32 s5, s5, s6
@@ -80663,6 +80753,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
 ; VI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v52, s44
+; VI-NEXT:    v_mov_b32_e32 v19, s67
 ; VI-NEXT:    v_mov_b32_e32 v12, s66
 ; VI-NEXT:    v_mov_b32_e32 v20, s65
 ; VI-NEXT:    v_mov_b32_e32 v13, s64
@@ -80721,7 +80812,6 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
 ; VI-NEXT:    v_mov_b32_e32 v4, s14
 ; VI-NEXT:    v_mov_b32_e32 v3, s40
 ; VI-NEXT:    v_mov_b32_e32 v9, s75
-; VI-NEXT:    v_mov_b32_e32 v19, s67
 ; VI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; VI-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; VI-NEXT:    v_mov_b32_e32 v52, s62
@@ -85085,11 +85175,11 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:    s_waitcnt vmcnt(11)
 ; VI-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
 ; VI-NEXT:    s_waitcnt vmcnt(9)
-; VI-NEXT:    v_lshlrev_b32_e32 v22, 8, v39
+; VI-NEXT:    v_lshlrev_b32_e32 v38, 8, v39
 ; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_lshlrev_b32_e32 v51, 8, v48
 ; VI-NEXT:    s_waitcnt vmcnt(5)
-; VI-NEXT:    v_lshlrev_b32_e32 v38, 8, v49
+; VI-NEXT:    v_lshlrev_b32_e32 v22, 8, v49
 ; VI-NEXT:    s_cbranch_scc0 .LBB107_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    v_or_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -85148,10 +85238,10 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:    v_or_b32_sdwa v1, v43, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v45, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v45, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v47, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v57, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v32, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v3, s4, v0
@@ -85211,9 +85301,9 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v24
 ; VI-NEXT:    v_or_b32_sdwa v9, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v23
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v45
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v57
 ; VI-NEXT:    v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_or_b32_sdwa v2, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v22, vcc, 0x300, v3
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v55
 ; VI-NEXT:    v_or_b32_sdwa v8, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -85270,8 +85360,8 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:    s_addk_i32 s6, 0x300
 ; VI-NEXT:    s_addk_i32 s8, 0x300
 ; VI-NEXT:    s_addk_i32 s10, 0x300
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v57
 ; VI-NEXT:    v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v45
 ; VI-NEXT:    s_addk_i32 s4, 0x300
 ; VI-NEXT:    s_lshl_b32 s5, s5, 16
 ; VI-NEXT:    s_lshl_b32 s7, s7, 16
@@ -85279,8 +85369,8 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; VI-NEXT:    s_and_b32 s10, s10, 0xffff
 ; VI-NEXT:    s_and_b32 s8, s8, 0xffff
 ; VI-NEXT:    s_and_b32 s6, s6, 0xffff
-; VI-NEXT:    v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 0x300, v1
+; VI-NEXT:    v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    s_or_b32 s9, s9, s10
 ; VI-NEXT:    s_or_b32 s7, s7, s8
 ; VI-NEXT:    s_or_b32 s5, s5, s6
@@ -87919,8 +88009,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr17
 ; GFX9-NEXT:    ; kill: killed $vgpr17
 ; GFX9-NEXT:    ; implicit-def: $vgpr17
-; GFX9-NEXT:    ; kill: killed $vgpr17
-; GFX9-NEXT:    ; implicit-def: $vgpr17
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -87937,46 +88025,47 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    ; implicit-def: $vgpr23
-; GFX9-NEXT:    ; implicit-def: $vgpr24
-; GFX9-NEXT:    ; implicit-def: $vgpr26
-; GFX9-NEXT:    ; implicit-def: $vgpr53
-; GFX9-NEXT:    ; implicit-def: $vgpr59
-; GFX9-NEXT:    ; implicit-def: $vgpr52
-; GFX9-NEXT:    ; implicit-def: $vgpr30
+; GFX9-NEXT:    ; implicit-def: $vgpr27
+; GFX9-NEXT:    ; implicit-def: $vgpr28
+; GFX9-NEXT:    ; implicit-def: $vgpr23
 ; GFX9-NEXT:    ; implicit-def: $vgpr51
-; GFX9-NEXT:    ; implicit-def: $vgpr25
-; GFX9-NEXT:    ; implicit-def: $vgpr31
-; GFX9-NEXT:    ; implicit-def: $vgpr46
+; GFX9-NEXT:    ; implicit-def: $vgpr58
 ; GFX9-NEXT:    ; implicit-def: $vgpr50
-; GFX9-NEXT:    ; implicit-def: $vgpr32
+; GFX9-NEXT:    ; implicit-def: $vgpr29
+; GFX9-NEXT:    ; implicit-def: $vgpr39
+; GFX9-NEXT:    ; implicit-def: $vgpr22
+; GFX9-NEXT:    ; implicit-def: $vgpr30
+; GFX9-NEXT:    ; implicit-def: $vgpr45
 ; GFX9-NEXT:    ; implicit-def: $vgpr63
-; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; implicit-def: $vgpr43
+; GFX9-NEXT:    ; implicit-def: $vgpr31
 ; GFX9-NEXT:    ; implicit-def: $vgpr62
-; GFX9-NEXT:    ; implicit-def: $vgpr34
+; GFX9-NEXT:    ; implicit-def: $vgpr32
+; GFX9-NEXT:    ; implicit-def: $vgpr42
 ; GFX9-NEXT:    ; implicit-def: $vgpr61
-; GFX9-NEXT:    ; implicit-def: $vgpr35
-; GFX9-NEXT:    ; implicit-def: $vgpr54
+; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr60
-; GFX9-NEXT:    ; implicit-def: $vgpr36
-; GFX9-NEXT:    ; implicit-def: $vgpr58
-; GFX9-NEXT:    ; implicit-def: $vgpr37
+; GFX9-NEXT:    ; implicit-def: $vgpr34
+; GFX9-NEXT:    ; implicit-def: $vgpr53
+; GFX9-NEXT:    ; implicit-def: $vgpr59
+; GFX9-NEXT:    ; implicit-def: $vgpr35
 ; GFX9-NEXT:    ; implicit-def: $vgpr57
-; GFX9-NEXT:    ; implicit-def: $vgpr38
+; GFX9-NEXT:    ; implicit-def: $vgpr36
+; GFX9-NEXT:    ; implicit-def: $vgpr52
 ; GFX9-NEXT:    ; implicit-def: $vgpr56
-; GFX9-NEXT:    ; implicit-def: $vgpr39
+; GFX9-NEXT:    ; implicit-def: $vgpr37
 ; GFX9-NEXT:    ; implicit-def: $vgpr47
+; GFX9-NEXT:    ; implicit-def: $vgpr38
+; GFX9-NEXT:    ; implicit-def: $vgpr46
 ; GFX9-NEXT:    ; implicit-def: $vgpr48
-; GFX9-NEXT:    ; implicit-def: $vgpr45
-; GFX9-NEXT:    ; implicit-def: $vgpr49
 ; GFX9-NEXT:    ; implicit-def: $vgpr44
-; GFX9-NEXT:    ; implicit-def: $vgpr42
+; GFX9-NEXT:    ; implicit-def: $vgpr49
+; GFX9-NEXT:    ; implicit-def: $vgpr43
 ; GFX9-NEXT:    ; implicit-def: $vgpr41
 ; GFX9-NEXT:    ; implicit-def: $vgpr40
-; GFX9-NEXT:    ; kill: killed $vgpr17
 ; GFX9-NEXT:    ; implicit-def: $vgpr55
-; GFX9-NEXT:    ; implicit-def: $vgpr28
+; GFX9-NEXT:    ; kill: killed $vgpr17
+; GFX9-NEXT:    ; implicit-def: $vgpr54
+; GFX9-NEXT:    ; implicit-def: $vgpr25
 ; GFX9-NEXT:    ; implicit-def: $vgpr21
 ; GFX9-NEXT:    ; implicit-def: $vgpr20
 ; GFX9-NEXT:    ; implicit-def: $vgpr19
@@ -87992,52 +88081,51 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v12
 ; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v10
-; GFX9-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
 ; GFX9-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
 ; GFX9-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
 ; GFX9-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
-; GFX9-NEXT:    v_lshrrev_b64 v[25:26], 24, v[3:4]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v55, 24, v16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v40, 8, v16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v41, 16, v15
-; GFX9-NEXT:    v_lshrrev_b32_e32 v42, 8, v15
-; GFX9-NEXT:    v_lshrrev_b32_e32 v44, 24, v14
+; GFX9-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX9-NEXT:    v_lshrrev_b64 v[22:23], 24, v[3:4]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v54, 24, v16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v55, 8, v16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v40, 16, v15
+; GFX9-NEXT:    v_lshrrev_b32_e32 v41, 8, v15
+; GFX9-NEXT:    v_lshrrev_b32_e32 v43, 24, v14
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v49, 8, v14
-; GFX9-NEXT:    v_lshrrev_b32_e32 v45, 16, v13
+; GFX9-NEXT:    v_lshrrev_b32_e32 v44, 16, v13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v48, 8, v13
-; GFX9-NEXT:    v_lshrrev_b32_e32 v47, 24, v12
-; GFX9-NEXT:    v_lshrrev_b32_e32 v39, 8, v12
-; GFX9-NEXT:    v_lshrrev_b32_e32 v56, 16, v11
-; GFX9-NEXT:    v_lshrrev_b32_e32 v38, 8, v11
-; GFX9-NEXT:    v_lshrrev_b32_e32 v57, 24, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v37, 8, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v58, 16, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v36, 8, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v60, 24, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v54, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 8, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v61, 16, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 8, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v62, 24, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v43, 16, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 8, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v63, 16, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 8, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v50, 24, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v46, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 8, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v51, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v30, 8, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v52, 24, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v59, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v53, 8, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 8, v1
-; GFX9-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
-; GFX9-NEXT:    v_lshrrev_b64 v[28:29], 24, v[5:6]
-; GFX9-NEXT:    v_lshrrev_b64 v[26:27], 24, v[1:2]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v46, 24, v12
+; GFX9-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
+; GFX9-NEXT:    v_lshrrev_b32_e32 v47, 16, v11
+; GFX9-NEXT:    v_lshrrev_b32_e32 v37, 8, v11
+; GFX9-NEXT:    v_lshrrev_b32_e32 v56, 24, v10
+; GFX9-NEXT:    v_lshrrev_b32_e32 v52, 16, v10
+; GFX9-NEXT:    v_lshrrev_b32_e32 v36, 8, v10
+; GFX9-NEXT:    v_lshrrev_b32_e32 v57, 16, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 8, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v59, 24, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v53, 16, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 8, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v60, 16, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 8, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v61, 24, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v42, 16, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 8, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v62, 16, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v63, 24, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v45, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v30, 8, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v39, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v29, 8, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v50, 24, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v58, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v51, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v28, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 8, v1
+; GFX9-NEXT:    v_lshrrev_b64 v[25:26], 24, v[5:6]
+; GFX9-NEXT:    v_lshrrev_b64 v[23:24], 24, v[1:2]
 ; GFX9-NEXT:  .LBB108_2: ; %Flow
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB108_4
@@ -88072,7 +88160,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
 ; GFX9-NEXT:    s_mov_b32 s7, 0x7060302
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v19, v20, vcc
-; GFX9-NEXT:    v_perm_b32 v23, v1, v18, s7
+; GFX9-NEXT:    v_perm_b32 v27, v1, v18, s7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
 ; GFX9-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; GFX9-NEXT:    v_bfe_u32 v19, v1, 16, 1
@@ -88101,7 +88189,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_or_b32_e32 v21, 0x400000, v3
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v20, v21, vcc
-; GFX9-NEXT:    v_perm_b32 v30, v3, v19, s7
+; GFX9-NEXT:    v_perm_b32 v29, v3, v19, s7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX9-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
 ; GFX9-NEXT:    v_bfe_u32 v20, v3, 16, 1
@@ -88130,7 +88218,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_or_b32_e32 v22, 0x400000, v5
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v21, v22, vcc
-; GFX9-NEXT:    v_perm_b32 v32, v5, v20, s7
+; GFX9-NEXT:    v_perm_b32 v31, v5, v20, s7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v8
 ; GFX9-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
 ; GFX9-NEXT:    v_bfe_u32 v21, v5, 16, 1
@@ -88150,281 +88238,278 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_bfe_u32 v22, v21, 16, 1
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX9-NEXT:    v_add3_u32 v22, v22, v21, s6
-; GFX9-NEXT:    v_or_b32_e32 v25, 0x400000, v21
+; GFX9-NEXT:    v_or_b32_e32 v23, 0x400000, v21
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
 ; GFX9-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v21, v22, v25, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v21, v22, v23, vcc
 ; GFX9-NEXT:    v_bfe_u32 v22, v7, 16, 1
 ; GFX9-NEXT:    v_add3_u32 v22, v22, v7, s6
-; GFX9-NEXT:    v_or_b32_e32 v25, 0x400000, v7
+; GFX9-NEXT:    v_or_b32_e32 v23, 0x400000, v7
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v22, v25, vcc
-; GFX9-NEXT:    v_perm_b32 v34, v7, v21, s7
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v22, v23, vcc
+; GFX9-NEXT:    v_perm_b32 v33, v7, v21, s7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v10
 ; GFX9-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
 ; GFX9-NEXT:    v_bfe_u32 v22, v7, 16, 1
 ; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX9-NEXT:    v_add3_u32 v22, v22, v7, s6
-; GFX9-NEXT:    v_or_b32_e32 v25, 0x400000, v7
+; GFX9-NEXT:    v_or_b32_e32 v23, 0x400000, v7
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
 ; GFX9-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v22, v25, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v22, v23, vcc
 ; GFX9-NEXT:    v_bfe_u32 v22, v10, 16, 1
 ; GFX9-NEXT:    v_add3_u32 v22, v22, v10, s6
-; GFX9-NEXT:    v_or_b32_e32 v25, 0x400000, v10
+; GFX9-NEXT:    v_or_b32_e32 v23, 0x400000, v10
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v22, v25, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v22, v23, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v9
 ; GFX9-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
-; GFX9-NEXT:    v_bfe_u32 v25, v22, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v23, v22, 16, 1
 ; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT:    v_add3_u32 v25, v25, v22, s6
-; GFX9-NEXT:    v_or_b32_e32 v26, 0x400000, v22
+; GFX9-NEXT:    v_add3_u32 v23, v23, v22, s6
+; GFX9-NEXT:    v_or_b32_e32 v24, 0x400000, v22
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
 ; GFX9-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
-; GFX9-NEXT:    v_cndmask_b32_e32 v22, v25, v26, vcc
-; GFX9-NEXT:    v_bfe_u32 v25, v9, 16, 1
-; GFX9-NEXT:    v_add3_u32 v25, v25, v9, s6
-; GFX9-NEXT:    v_or_b32_e32 v26, 0x400000, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v22, v23, v24, vcc
+; GFX9-NEXT:    v_bfe_u32 v23, v9, 16, 1
+; GFX9-NEXT:    v_add3_u32 v23, v23, v9, s6
+; GFX9-NEXT:    v_or_b32_e32 v24, 0x400000, v9
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v25, v26, vcc
-; GFX9-NEXT:    v_perm_b32 v36, v9, v22, s7
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v23, v24, vcc
+; GFX9-NEXT:    v_perm_b32 v35, v9, v22, s7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
 ; GFX9-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
-; GFX9-NEXT:    v_bfe_u32 v25, v9, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v23, v9, 16, 1
 ; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT:    v_add3_u32 v25, v25, v9, s6
-; GFX9-NEXT:    v_or_b32_e32 v26, 0x400000, v9
+; GFX9-NEXT:    v_add3_u32 v23, v23, v9, s6
+; GFX9-NEXT:    v_or_b32_e32 v24, 0x400000, v9
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
 ; GFX9-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v25, v26, vcc
-; GFX9-NEXT:    v_bfe_u32 v25, v12, 16, 1
-; GFX9-NEXT:    v_add3_u32 v25, v25, v12, s6
-; GFX9-NEXT:    v_or_b32_e32 v26, 0x400000, v12
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v23, v24, vcc
+; GFX9-NEXT:    v_bfe_u32 v23, v12, 16, 1
+; GFX9-NEXT:    v_add3_u32 v23, v23, v12, s6
+; GFX9-NEXT:    v_or_b32_e32 v24, 0x400000, v12
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v25, v26, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v11
-; GFX9-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
-; GFX9-NEXT:    v_bfe_u32 v26, v25, 16, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v23, v24, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v11
+; GFX9-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
+; GFX9-NEXT:    v_bfe_u32 v24, v23, 16, 1
 ; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT:    v_add3_u32 v26, v26, v25, s6
-; GFX9-NEXT:    v_or_b32_e32 v27, 0x400000, v25
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
+; GFX9-NEXT:    v_add3_u32 v24, v24, v23, s6
+; GFX9-NEXT:    v_or_b32_e32 v25, 0x400000, v23
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
 ; GFX9-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; GFX9-NEXT:    v_cndmask_b32_e32 v25, v26, v27, vcc
-; GFX9-NEXT:    v_bfe_u32 v26, v11, 16, 1
-; GFX9-NEXT:    v_add3_u32 v26, v26, v11, s6
-; GFX9-NEXT:    v_or_b32_e32 v27, 0x400000, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v23, v24, v25, vcc
+; GFX9-NEXT:    v_bfe_u32 v24, v11, 16, 1
+; GFX9-NEXT:    v_add3_u32 v24, v24, v11, s6
+; GFX9-NEXT:    v_or_b32_e32 v25, 0x400000, v11
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, v26, v27, vcc
-; GFX9-NEXT:    v_perm_b32 v38, v11, v25, s7
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v24, v25, vcc
+; GFX9-NEXT:    v_perm_b32 v37, v11, v23, s7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v14
 ; GFX9-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; GFX9-NEXT:    v_bfe_u32 v26, v11, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v24, v11, 16, 1
 ; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT:    v_add3_u32 v26, v26, v11, s6
-; GFX9-NEXT:    v_or_b32_e32 v27, 0x400000, v11
+; GFX9-NEXT:    v_add3_u32 v24, v24, v11, s6
+; GFX9-NEXT:    v_or_b32_e32 v25, 0x400000, v11
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
 ; GFX9-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, v26, v27, vcc
-; GFX9-NEXT:    v_bfe_u32 v26, v14, 16, 1
-; GFX9-NEXT:    v_add3_u32 v26, v26, v14, s6
-; GFX9-NEXT:    v_or_b32_e32 v27, 0x400000, v14
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v24, v25, vcc
+; GFX9-NEXT:    v_bfe_u32 v24, v14, 16, 1
+; GFX9-NEXT:    v_add3_u32 v24, v24, v14, s6
+; GFX9-NEXT:    v_or_b32_e32 v25, 0x400000, v14
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
-; GFX9-NEXT:    v_cndmask_b32_e32 v14, v26, v27, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v13
-; GFX9-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
-; GFX9-NEXT:    v_bfe_u32 v27, v26, 16, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v24, v25, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v13
+; GFX9-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
+; GFX9-NEXT:    v_bfe_u32 v25, v24, 16, 1
 ; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT:    v_add3_u32 v27, v27, v26, s6
-; GFX9-NEXT:    v_or_b32_e32 v28, 0x400000, v26
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
+; GFX9-NEXT:    v_add3_u32 v25, v25, v24, s6
+; GFX9-NEXT:    v_or_b32_e32 v26, 0x400000, v24
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
 ; GFX9-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; GFX9-NEXT:    v_cndmask_b32_e32 v26, v27, v28, vcc
-; GFX9-NEXT:    v_bfe_u32 v27, v13, 16, 1
-; GFX9-NEXT:    v_add3_u32 v27, v27, v13, s6
-; GFX9-NEXT:    v_or_b32_e32 v28, 0x400000, v13
+; GFX9-NEXT:    v_cndmask_b32_e32 v24, v25, v26, vcc
+; GFX9-NEXT:    v_bfe_u32 v25, v13, 16, 1
+; GFX9-NEXT:    v_add3_u32 v25, v25, v13, s6
+; GFX9-NEXT:    v_or_b32_e32 v26, 0x400000, v13
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v27, v28, vcc
-; GFX9-NEXT:    v_perm_b32 v48, v13, v26, s7
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v25, v26, vcc
+; GFX9-NEXT:    v_perm_b32 v48, v13, v24, s7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v16
 ; GFX9-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; GFX9-NEXT:    v_bfe_u32 v27, v13, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v25, v13, 16, 1
 ; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX9-NEXT:    v_add3_u32 v27, v27, v13, s6
-; GFX9-NEXT:    v_or_b32_e32 v28, 0x400000, v13
+; GFX9-NEXT:    v_add3_u32 v25, v25, v13, s6
+; GFX9-NEXT:    v_or_b32_e32 v26, 0x400000, v13
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
 ; GFX9-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v27, v28, vcc
-; GFX9-NEXT:    v_bfe_u32 v27, v16, 16, 1
-; GFX9-NEXT:    v_add3_u32 v27, v27, v16, s6
-; GFX9-NEXT:    v_or_b32_e32 v28, 0x400000, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v25, v26, vcc
+; GFX9-NEXT:    v_bfe_u32 v25, v16, 16, 1
+; GFX9-NEXT:    v_add3_u32 v25, v25, v16, s6
+; GFX9-NEXT:    v_or_b32_e32 v26, 0x400000, v16
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v16, v27, v28, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v15
-; GFX9-NEXT:    v_add_f32_e32 v27, 0x40c00000, v27
-; GFX9-NEXT:    v_bfe_u32 v28, v27, 16, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v25, v26, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v15
+; GFX9-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
+; GFX9-NEXT:    v_bfe_u32 v26, v25, 16, 1
 ; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT:    v_add3_u32 v28, v28, v27, s6
-; GFX9-NEXT:    v_or_b32_e32 v29, 0x400000, v27
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
+; GFX9-NEXT:    v_add3_u32 v26, v26, v25, s6
+; GFX9-NEXT:    v_or_b32_e32 v39, 0x400000, v25
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
 ; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; GFX9-NEXT:    v_cndmask_b32_e32 v27, v28, v29, vcc
-; GFX9-NEXT:    v_bfe_u32 v28, v15, 16, 1
-; GFX9-NEXT:    v_add3_u32 v28, v28, v15, s6
-; GFX9-NEXT:    v_or_b32_e32 v29, 0x400000, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v25, v26, v39, vcc
+; GFX9-NEXT:    v_bfe_u32 v26, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v26, v26, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v39, 0x400000, v15
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v28, v29, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v26, v39, vcc
 ; GFX9-NEXT:    v_perm_b32 v51, v16, v13, s7
-; GFX9-NEXT:    v_perm_b32 v50, v15, v27, s7
-; GFX9-NEXT:    v_perm_b32 v24, v2, v17, s7
-; GFX9-NEXT:    v_perm_b32 v31, v4, v1, s7
+; GFX9-NEXT:    v_perm_b32 v50, v15, v25, s7
+; GFX9-NEXT:    v_perm_b32 v28, v2, v17, s7
+; GFX9-NEXT:    v_perm_b32 v30, v4, v1, s7
 ; GFX9-NEXT:    v_perm_b32 v49, v14, v11, s7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v46, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v45, 16, v4
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v59, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v58, 16, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v17
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v18
 ; GFX9-NEXT:    v_lshrrev_b64 v[17:18], 24, v[50:51]
-; GFX9-NEXT:    v_perm_b32 v33, v6, v3, s7
-; GFX9-NEXT:    v_perm_b32 v39, v12, v9, s7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v43, 16, v6
+; GFX9-NEXT:    v_perm_b32 v32, v6, v3, s7
+; GFX9-NEXT:    v_perm_b32 v38, v12, v9, s7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v42, 16, v6
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v19
 ; GFX9-NEXT:    v_lshrrev_b64 v[18:19], 24, v[48:49]
-; GFX9-NEXT:    v_perm_b32 v35, v8, v5, s7
-; GFX9-NEXT:    v_perm_b32 v37, v10, v7, s7
+; GFX9-NEXT:    v_perm_b32 v34, v8, v5, s7
+; GFX9-NEXT:    v_perm_b32 v36, v10, v7, s7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v53, 16, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v20
+; GFX9-NEXT:    v_lshrrev_b64 v[19:20], 24, v[37:38]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v52, 16, v10
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v21
+; GFX9-NEXT:    v_lshrrev_b64 v[20:21], 24, v[35:36]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v16
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v14
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v11
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v12
-; GFX9-NEXT:    v_lshrrev_b32_e32 v54, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v20
-; GFX9-NEXT:    v_lshrrev_b64 v[19:20], 24, v[38:39]
-; GFX9-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v26
-; GFX9-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v25
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v21
-; GFX9-NEXT:    v_lshrrev_b64 v[20:21], 24, v[36:37]
-; GFX9-NEXT:    v_lshrrev_b64 v[25:26], 24, v[30:31]
-; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v27
-; GFX9-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v22
-; GFX9-NEXT:    v_lshrrev_b64 v[21:22], 24, v[34:35]
-; GFX9-NEXT:    v_lshrrev_b64 v[28:29], 24, v[32:33]
-; GFX9-NEXT:    v_lshrrev_b64 v[26:27], 24, v[23:24]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v55, 24, v51
-; GFX9-NEXT:    v_lshrrev_b32_e32 v40, 8, v51
-; GFX9-NEXT:    v_lshrrev_b32_e32 v41, 16, v50
-; GFX9-NEXT:    v_lshrrev_b32_e32 v42, 8, v50
-; GFX9-NEXT:    v_lshrrev_b32_e32 v44, 24, v49
+; GFX9-NEXT:    v_lshrrev_b64 v[21:22], 24, v[33:34]
+; GFX9-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v23
+; GFX9-NEXT:    v_lshrrev_b64 v[22:23], 24, v[29:30]
+; GFX9-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v25
+; GFX9-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v24
+; GFX9-NEXT:    v_lshrrev_b64 v[25:26], 24, v[31:32]
+; GFX9-NEXT:    v_lshrrev_b64 v[23:24], 24, v[27:28]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v54, 24, v51
+; GFX9-NEXT:    v_lshrrev_b32_e32 v55, 8, v51
+; GFX9-NEXT:    v_lshrrev_b32_e32 v40, 16, v50
+; GFX9-NEXT:    v_lshrrev_b32_e32 v41, 8, v50
+; GFX9-NEXT:    v_lshrrev_b32_e32 v43, 24, v49
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v49, 8, v49
-; GFX9-NEXT:    v_lshrrev_b32_e32 v45, 16, v48
+; GFX9-NEXT:    v_lshrrev_b32_e32 v44, 16, v48
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v48, 8, v48
-; GFX9-NEXT:    v_lshrrev_b32_e32 v47, 24, v39
-; GFX9-NEXT:    v_lshrrev_b32_e32 v39, 8, v39
-; GFX9-NEXT:    v_lshrrev_b32_e32 v56, 16, v38
+; GFX9-NEXT:    v_lshrrev_b32_e32 v46, 24, v38
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v38, 8, v38
-; GFX9-NEXT:    v_lshrrev_b32_e32 v57, 24, v37
+; GFX9-NEXT:    v_lshrrev_b32_e32 v47, 16, v37
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v37, 8, v37
-; GFX9-NEXT:    v_lshrrev_b32_e32 v58, 16, v36
+; GFX9-NEXT:    v_lshrrev_b32_e32 v56, 24, v36
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v36, 8, v36
-; GFX9-NEXT:    v_lshrrev_b32_e32 v60, 24, v35
+; GFX9-NEXT:    v_lshrrev_b32_e32 v57, 16, v35
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 8, v35
-; GFX9-NEXT:    v_lshrrev_b32_e32 v61, 16, v34
+; GFX9-NEXT:    v_lshrrev_b32_e32 v59, 24, v34
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 8, v34
-; GFX9-NEXT:    v_lshrrev_b32_e32 v62, 24, v33
+; GFX9-NEXT:    v_lshrrev_b32_e32 v60, 16, v33
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 8, v33
-; GFX9-NEXT:    v_lshrrev_b32_e32 v63, 16, v32
+; GFX9-NEXT:    v_lshrrev_b32_e32 v61, 24, v32
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 8, v32
-; GFX9-NEXT:    v_lshrrev_b32_e32 v50, 24, v31
+; GFX9-NEXT:    v_lshrrev_b32_e32 v62, 16, v31
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 8, v31
-; GFX9-NEXT:    v_lshrrev_b32_e32 v51, 16, v30
+; GFX9-NEXT:    v_lshrrev_b32_e32 v63, 24, v30
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v30, 8, v30
-; GFX9-NEXT:    v_lshrrev_b32_e32 v52, 24, v24
-; GFX9-NEXT:    v_lshrrev_b32_e32 v53, 8, v24
-; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 16, v23
-; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 8, v23
+; GFX9-NEXT:    v_lshrrev_b32_e32 v39, 16, v29
+; GFX9-NEXT:    v_lshrrev_b32_e32 v29, 8, v29
+; GFX9-NEXT:    v_lshrrev_b32_e32 v50, 24, v28
+; GFX9-NEXT:    v_lshrrev_b32_e32 v51, 8, v28
+; GFX9-NEXT:    v_lshrrev_b32_e32 v28, 16, v27
+; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 8, v27
 ; GFX9-NEXT:  .LBB108_4: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    v_lshlrev_b16_e32 v22, 8, v23
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b16_e32 v22, 8, v53
-; GFX9-NEXT:    v_or_b32_sdwa v2, v2, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b16_e32 v22, 8, v26
-; GFX9-NEXT:    v_or_b32_sdwa v22, v24, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b16_e32 v24, 8, v27
+; GFX9-NEXT:    v_lshlrev_b16_e32 v23, 8, v23
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v23, v28, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b16_e32 v24, 8, v51
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v52
-; GFX9-NEXT:    v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v50
+; GFX9-NEXT:    v_or_b32_sdwa v2, v2, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v25
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v30
-; GFX9-NEXT:    v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v22
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v29
+; GFX9-NEXT:    v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v31
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v50
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v30
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v63
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v32
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v28
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v31
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v25
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v33
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v62
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v32
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v61
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:20
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v34
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v33
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v21
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v61, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:24
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v35
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v60
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v34
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v59
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v36
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v35
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v20
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v37
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v57
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v36
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v56
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:36
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v38
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v37
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v19
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:40
 ; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v39
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v47
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v38
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v46
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -88433,26 +88518,26 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v48
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v18
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
 ; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v49
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v44
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v43
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:52
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v42
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v41
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v17
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:56
 ; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v40
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v55
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v55
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v54
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -88481,62 +88566,62 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
 ; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
@@ -88549,18 +88634,18 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 24, v16
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 8, v16
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v15
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 24, v14
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 8, v14
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v67, 8, v13
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 24, v12
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v12
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v11
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 24, v10
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v10
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v85, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v9
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v86, 24, v8
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v8
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 8, v7
@@ -88573,247 +88658,246 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v103, 24, v2
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v112, 8, v2
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v113, 8, v1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.h, v16.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.h, v14.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.h, v12.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.h, v10.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.h, v8.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v6.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.h, v4.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.h, v2.h
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v69.h, v16.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v71.h, v15.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.h, v14.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v81.h, v13.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.h, v12.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v68.h, v11.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.h, v10.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.h, v9.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v8.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.h, v7.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.h, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.h, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.h, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.h, v3.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v4.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.h, v3.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v2.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.h, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v4.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.h, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.h, v6.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.h, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v8.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.h, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.h, v10.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v70.h, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.h, v12.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v85.h, v13.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v68.h, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v67.h, v14.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v84.h, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v81.h, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v82.h, v16.h
 ; GFX11-TRUE16-NEXT:  .LBB108_2: ; %Flow
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB108_4
 ; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v4
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_f32 v1, 0x40c00000, v1
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v17, 16, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v17, 0x40c00000, v17
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v17, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v17
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v21, v21, v2, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v17, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v18, 16, 1
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v28, v20, v22, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v26, v20, v22, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v18
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v26.h
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v1, 0x7fff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v2, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v2
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v21, v21, v2, 0x7fff
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v28.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v26, v21, v23, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v27, v21, v23, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v31, v20, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v27
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v28, v20, v21, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v18, v4, 16, 1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v31.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v103, 24, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v28.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v17, v22, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v19, 16, 1
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v18, v4, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v4
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v21, 0x40c00000, v21
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v19, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v20, v1
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v112, 8, v2
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v30, v18, v22, vcc_lo
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v19, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v19
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v6
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v19, 0x7fff
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v21, 16, 1
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v103, 24, v2
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v29, v17, v23, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v30.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v27, v17, v23, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v21, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v21
-; GFX11-TRUE16-NEXT:    v_add3_u32 v4, v4, v21, 0x7fff
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v30.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v18, v3, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, 0x400000, v3
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v112, 8, v2
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v113, 8, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_add3_u32 v4, v4, v21, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v18, v3, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v18, v19, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v113, 8, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v31, v18, v19 :: v_dual_add_f32 v18, 0x40c00000, v22
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v22 :: v_dual_cndmask_b32 v3, v4, v23
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v31.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v18, 16, 1
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0xffff, v17, v27
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v4, v23, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0xffff, v17, v29
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v6, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v18
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v34.h
-; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v6, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v100, 24, v4
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v18
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v19, v3
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v21, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v6, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v6
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v20, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v100, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v4
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v102, 8, v3
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v17, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v17, v21, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v5, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v5
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v29, v19, v22 :: v_dual_add_f32 v20, 0x40c00000, v20
-; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v5, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v19, v22, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v8
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v20, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v20
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v38, v17, v21 :: v_dual_add_f32 v19, 0x40c00000, v19
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v20, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v38.h
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v5, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v32.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v8, 0x40c00000, v8 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v35, v17, v21, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v8, 16, 1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v33.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v35.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v20, v6, v22, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT:    v_add3_u32 v5, v5, v19, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v17, v8, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v34, v6, v17 :: v_dual_add_f32 v19, 0x40c00000, v19
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v6, 0xffff, v18, v33
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v34.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v5, v19, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v19
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v36, v6, v17 :: v_dual_and_b32 v23, 0xffff0000, v7
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v17, 0x40c00000, v23
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v6, 0xffff, v18, v29
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v32, v5, v22 :: v_dual_and_b32 v23, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v97, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v6
+; GFX11-TRUE16-NEXT:    v_add3_u32 v5, v5, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v36, v5, v22 :: v_dual_and_b32 v23, 0xffff0000, v7
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v21, v20
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_lshlrev_b32 v20, 16, v10
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v17
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v36.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v8, 0xffff, v8, v36
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v99, 8, v5
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v86, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v8
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v7, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v7
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v18, v17, 0x7fff
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v8, 0xffff, v8, v32
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v20, 16, 1
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v97, 24, v6
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v6
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v86, 24, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v50, v19, v21, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v49, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v17, 0x40c00000, v23 :: v_dual_add_f32 v10, 0x40c00000, v10
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v20, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v20
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v8
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v99, 8, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v18, v17, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v17
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v18, v17, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v10, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v17, v18, v22, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v50.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v39, v7, v21 :: v_dual_add_f32 v10, 0x40c00000, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v9
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v10, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v49.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v38, v7, v21, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
-; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v10, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v35, v19, v22 :: v_dual_lshlrev_b32 v10, 16, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v48, v19, v22 :: v_dual_lshlrev_b32 v7, 16, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v38.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v12, 0x40c00000, v12 :: v_dual_add_f32 v7, 0x40c00000, v7
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v21, 0x40c00000, v10
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v39.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v10, 0xffff, v19, v48
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v24, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v7, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v22, v21, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v21
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v10, 0xffff, v19, v35
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v22, v21, 0x7fff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 24, v10
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v10
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v49, v22, v37, vcc_lo
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v23 :: v_dual_add_f32 v12, 0x40c00000, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v14
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v7, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v7
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v24, v12, 16, 1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v9, 16, 1
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v7, 0x7fff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v48, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v22, v21, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v9, 0x40c00000, v23
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v24, v24, v12, 0x7fff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v51, 0x400000, v12
-; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v50, 0x400000, v12
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v21, 0x40c00000, v39 :: v_dual_cndmask_b32 v52, v22, v37
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v9, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v14
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v52.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v65, v19, v25, vcc_lo
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v21, 0x40c00000, v48 :: v_dual_add_f32 v14, 0x40c00000, v14
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v9
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v49.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v21, 16, 1
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v37, v24, v51, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v53, v24, v50, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v25, v14, 16, 1
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v12, 0xffff, v7, v37
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v10
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v12, 0xffff, v7, v53
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v9, v20, v23, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v11
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v19, v21, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, 0x400000, v21
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v65.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v25, v14, 16, 1
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v7, 16, 1
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 24, v12
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v12
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v65.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 24, v12
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v11, v19, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v22
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v21, v23, v7, 0x7fff
@@ -88822,27 +88906,28 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v24, v19, 16, 1
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v7, 0x40c00000, v23 :: v_dual_cndmask_b32 v68, v21, v22
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v9, 0xffff, v20, v9
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v7, 0x40c00000, v23 :: v_dual_cndmask_b32 v70, v21, v22
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v21, v24, v19, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v19
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v23, v25, v14, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, 0x400000, v14
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v25, v7, 16, 1
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v13, 0x40c00000, v13 :: v_dual_cndmask_b32 v48, v21, v22
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v67, v21, v22, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, 0x400000, v7
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v25, v7, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v16
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v64, v23, v24, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v68, v23, v24, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v13, 16, 1
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v9, 0xffff, v20, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 8, v12
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v64.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v68.h
 ; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v7, v14, v19 :: v_dual_add_f32 v14, 0x40c00000, v21
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v15
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v23, v13, 0x7fff
@@ -88852,42 +88937,42 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v21, 0x40c00000, v21
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v81, v19, v23, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v85, v19, v23, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v13, v13, v16, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v21, 16, 1
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v24, v14, 16, 1
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v52, 0x400000, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v21
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v23, v23, v21, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v69, v13, v25, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v81, v13, v25, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v24, v14, 0x7fff
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, 0x400000, v14
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v51, v15, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v15, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, 0x400000, v15
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v71, v23, v52, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v84, v23, v39, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v81.h
-; GFX11-TRUE16-NEXT:    v_add3_u32 v13, v51, v15, 0x7fff
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v68.h
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v14, 0xffff, v22, v48
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v54, v19, v24, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v85.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v13, v37, v15, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v70.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v14, 0xffff, v22, v67
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v82, v19, v24, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v69.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v71.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v81.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v84.h
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v11, 0xffff, v23, v11
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 24, v14
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v13, v13, v16, vcc_lo
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v16, 0xffff, v19, v54
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 8, v14
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v11
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v85, 8, v9
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v16, 0xffff, v19, v82
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v9
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v15, 0xffff, v15, v13
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v13, 0xffff, v21, v7
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v7, 0xffff, v18, v17
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 24, v16
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v16
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
@@ -88896,22 +88981,21 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v15
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v67, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 8, v13
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 8, v7
 ; GFX11-TRUE16-NEXT:  .LBB108_4: ; %end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v28.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v113.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v24.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.l, 8, v112.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v26.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v27.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v2.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v31.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v1.h, v2.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v26.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v102.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -88927,13 +89011,13 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v8
 ; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v3.h, v6.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v35.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v99.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v30.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v4
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v29.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v100.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v3.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
@@ -88943,19 +89027,19 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v2.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff, v3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v14, v8
-; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v50.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v49.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.l, 8, v96.l
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v6
 ; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v4.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v2
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v10, v12
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v21.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v34.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.l, 8, v87.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v36.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.l, 8, v86.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v85.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v83.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v8.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v4
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v16, v22
@@ -88966,15 +89050,15 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v7
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v9.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v20.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v32.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v98.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v9.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v11.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v8
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v39.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v84.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v83.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v80.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v71.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
@@ -88987,24 +89071,24 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v7
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v5
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v23, v24
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v68.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v82.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v70.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v69.l
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v8
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v9
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v16, v21
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v12, v22
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v14, v19
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v80.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.l, 8, v70.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v81.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.l, 8, v67.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v52.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v66.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v53.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.l, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v85.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.l, 8, v55.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v13.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v18.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v16.h, 0xff, v64.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v66.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.h, 0xff, v68.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v54.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v33.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v97.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
@@ -89021,16 +89105,16 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v13
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v14
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v16
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v55.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v71.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v53.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v50.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v15.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v17.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v69.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v52.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v54.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v81.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v37.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
@@ -89062,107 +89146,107 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr99
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr98
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr97
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
 ; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB108_2
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[27:28], 24, v[7:8]
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[28:29], 24, v[5:6]
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[11:12]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[29:30], 24, v[3:4]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 24, v16
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v16
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v16
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v15
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 8, v15
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 24, v14
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v14
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 8, v14
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v13
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v13
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 24, v12
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v12
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v12
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 24, v10
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v10
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v66, 8, v10
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v9
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v9
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 24, v8
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v8
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v8
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v7
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v7
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 24, v6
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v6
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 8, v6
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 16, v5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 24, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 8, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v97, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v3
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v98, 24, v2
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v2
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v1
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[9:10]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[30:31], 24, v[1:2]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v19, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v97, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
 ; GFX11-FAKE16-NEXT:  .LBB108_2: ; %Flow
 ; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB108_4
@@ -89195,227 +89279,222 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v19
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v2
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v21
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v27, v2, v1, 0x7060302
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_lshlrev_b32 v21, 16, v3
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v22, v17, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-FAKE16-NEXT:    v_perm_b32 v21, v2, v1, 0x7060302
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v17
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v21, 0x40c00000, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v20, v23, vcc_lo
-; GFX11-FAKE16-NEXT:    v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v23, 16, v3
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v18, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, 0x400000, v18
-; GFX11-FAKE16-NEXT:    v_perm_b32 v20, v19, v17, 0x7060302
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v19, v4, 16, 1
-; GFX11-FAKE16-NEXT:    v_add3_u32 v22, v22, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v20, v18, 16, 1
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v4
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v26, v23, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v24, v21, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v26, v19, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v19, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v20, v20, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v17
 ; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v19, v4, 0x7fff
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v18, v22, v24 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v20, v22, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, 0x400000, v23
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v98, 24, v21
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v20
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v19, v25, vcc_lo
-; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v26, v23, 0x7fff
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v4
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v19, v19, v24 :: v_dual_lshlrev_b32 v24, 16, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 24, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v97, 16, v26
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v19, v23 :: v_dual_lshlrev_b32 v23, 16, v6
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v24, 0x40c00000, v24
-; GFX11-FAKE16-NEXT:    v_dual_add_f32 v23, 0x40c00000, v25 :: v_dual_add_f32 v6, 0x40c00000, v6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v3, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v24, v21, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT:    v_perm_b32 v29, v4, v18, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v20, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v19, v19, v22 :: v_dual_lshlrev_b32 v22, 16, v5
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v21, 0x40c00000, v23
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v20, v20, v3, 0x7fff
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, 0x400000, v23
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, 0x400000, v6
-; GFX11-FAKE16-NEXT:    v_add3_u32 v22, v22, v3, 0x7fff
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v29, v24, 16, 1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v22, v26, vcc_lo
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v23, 16, 1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    v_perm_b32 v26, v4, v18, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v4
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v21
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v18
-; GFX11-FAKE16-NEXT:    v_perm_b32 v25, v3, v19, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v20, v24, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v20, v21, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v25, v22, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v28, v3, v19, 0x7060302
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v3, v6, 16, 1
-; GFX11-FAKE16-NEXT:    v_add3_u32 v22, v22, v23, 0x7fff
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v20, v20, v21, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v28
 ; GFX11-FAKE16-NEXT:    v_add3_u32 v3, v3, v6, 0x7fff
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v27, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v20, v23 :: v_dual_add_f32 v5, 0x40c00000, v5
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v29, v24, 0x7fff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, 0x400000, v24
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v28, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v97, v3, v22, 0x7060302
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v24, v6, v27 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v28 :: v_dual_lshlrev_b32 v27, 16, v7
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v19
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v23, v5, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v25, v22, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v22
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v24, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v8
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v21, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v6, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v24
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, 0x400000, v6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_add3_u32 v23, v23, v5, 0x7fff
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v23, v29 :: v_dual_and_b32 v8, 0xffff0000, v8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v23, v6, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v7
+; GFX11-FAKE16-NEXT:    v_perm_b32 v86, v3, v20, 0x7060302
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, 0x400000, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v21, v25 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v6, 16, 1
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-FAKE16-NEXT:    v_perm_b32 v96, v5, v24, 0x7060302
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v3
+; GFX11-FAKE16-NEXT:    v_perm_b32 v85, v5, v22, 0x7060302
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v8, 16, 1
-; GFX11-FAKE16-NEXT:    v_add3_u32 v23, v23, v6, 0x7fff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, 0x400000, v8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 16, v96
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v21, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v85
 ; GFX11-FAKE16-NEXT:    v_add3_u32 v5, v5, v8, 0x7fff
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v23, v28, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v21, v24, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v96
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v29, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v10
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v27, 0x40c00000, v27
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v25, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v85, v5, v6, 0x7060302
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v5
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v30, v27, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, 0x400000, v27
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT:    v_perm_b32 v83, v5, v6, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v30, v23, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, 0x400000, v23
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v24
-; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v30, v27, 0x7fff
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 24, v85
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v27, v8, v28 :: v_dual_lshlrev_b32 v28, 16, v9
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v22
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v30, v23, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v8, v24 :: v_dual_lshlrev_b32 v24, 16, v9
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v25
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v31, v28, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v31, v24, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v8
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v23, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v7, 16, 1
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, 0x400000, v7
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_add3_u32 v23, v23, v7, 0x7fff
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v23, v30, vcc_lo
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v23, v8, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v21, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v21, v30, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v8, 16, 1
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, 0x400000, v10
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v84, v7, v27, 0x7060302
+; GFX11-FAKE16-NEXT:    v_perm_b32 v82, v7, v23, 0x7060302
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v10, 16, 1
-; GFX11-FAKE16-NEXT:    v_add3_u32 v23, v23, v8, 0x7fff
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v84
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v21, v8, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v82
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v10, 0x7fff
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v23, v29, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v21, v25, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT:    v_add3_u32 v10, v31, v28, 0x7fff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, 0x400000, v28
+; GFX11-FAKE16-NEXT:    v_add3_u32 v10, v31, v24, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v82
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v30 :: v_dual_lshlrev_b32 v30, 16, v12
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v81, v7, v8, 0x7060302
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT:    v_perm_b32 v70, v7, v8, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v7
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v10, v29, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v10, v25, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v30
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v27
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v66, 8, v81
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v70
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v30, v12, 16, 1
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_add3_u32 v30, v30, v12, 0x7fff
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v23, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v9, 16, 1
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, 0x400000, v9
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-FAKE16-NEXT:    v_add3_u32 v23, v23, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v21, v9, 0x7fff
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v23, v31, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v11
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v29, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v21, v31, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v11
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v25, v10, 16, 1
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-FAKE16-NEXT:    v_perm_b32 v80, v9, v28, 0x7060302
+; GFX11-FAKE16-NEXT:    v_perm_b32 v69, v9, v24, 0x7060302
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v9, 0x40c00000, v23
-; GFX11-FAKE16-NEXT:    v_add3_u32 v23, v29, v10, 0x7fff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v9, 0x40c00000, v21
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v25, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v10
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, 0x400000, v12
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v9, 16, 1
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v80
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v23, v29, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v69
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v21, v25, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, 0x400000, v9
-; GFX11-FAKE16-NEXT:    v_add3_u32 v23, v32, v9, 0x7fff
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v80
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v32, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 8, v69
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v30, v31, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v14
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; GFX11-FAKE16-NEXT:    v_perm_b32 v65, v12, v10, 0x7060302
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v23, v29, vcc_lo
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v23, 0x40c00000, v30
+; GFX11-FAKE16-NEXT:    v_perm_b32 v55, v12, v10, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v21, v25, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v21, 0x40c00000, v30
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v31, v11, 16, 1
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, 0x400000, v11
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v23, 16, 1
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v65
-; GFX11-FAKE16-NEXT:    v_add3_u32 v29, v31, v11, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v21, 16, 1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v55
+; GFX11-FAKE16-NEXT:    v_add3_u32 v25, v31, v11, 0x7fff
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v13
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v29, v30, vcc_lo
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v29, 0x40c00000, v31
-; GFX11-FAKE16-NEXT:    v_add3_u32 v30, v32, v23, 0x7fff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, 0x400000, v23
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v25, v30, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v25, 0x40c00000, v31
+; GFX11-FAKE16-NEXT:    v_add3_u32 v30, v32, v21, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, 0x400000, v21
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v14, 16, 1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v29, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v25, 16, 1
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, 0x400000, v29
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v30, v31, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, 0x400000, v25
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v30, v31, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_add3_u32 v30, v32, v14, 0x7fff
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, 0x400000, v14
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v33, v29, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v33, v25, 0x7fff
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v16
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v30, v31, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v30, 0x40c00000, v33
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
-; GFX11-FAKE16-NEXT:    v_perm_b32 v64, v11, v9, 0x7060302
-; GFX11-FAKE16-NEXT:    v_perm_b32 v52, v14, v23, 0x7060302
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v29, v32, v34 :: v_dual_lshlrev_b32 v34, 16, v15
+; GFX11-FAKE16-NEXT:    v_perm_b32 v54, v11, v9, 0x7060302
+; GFX11-FAKE16-NEXT:    v_perm_b32 v50, v14, v21, 0x7060302
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v25, v32, v34 :: v_dual_lshlrev_b32 v34, 16, v15
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v13, 16, 1
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, 0x400000, v13
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v30, 16, 1
@@ -89423,8 +89502,8 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX11-FAKE16-NEXT:    v_add3_u32 v31, v35, v13, 0x7fff
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v28
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[27:28], 24, v[84:85]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v50
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v31, v32, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v31, 0x40c00000, v34
@@ -89433,190 +89512,191 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v16, 16, 1
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v31, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v30, v15, 16, 1
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v31
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v15
-; GFX11-FAKE16-NEXT:    v_perm_b32 v51, v13, v29, 0x7060302
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v30, v32, v33, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v33, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v34, v16, 0x7fff
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, 0x400000, v16
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v15, 16, 1
 ; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v31, 0x7fff
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v29
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[28:29], 24, v[96:97]
+; GFX11-FAKE16-NEXT:    v_add3_u32 v30, v30, v15, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v49, v13, v25, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v25
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v33, v34, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v15, 0x7fff
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v14
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v23
-; GFX11-FAKE16-NEXT:    v_perm_b32 v39, v16, v30, 0x7060302
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v35, v36, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v12
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v10
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v32, v37, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v16
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v30
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v22
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[64:65]
-; GFX11-FAKE16-NEXT:    v_perm_b32 v38, v15, v31, 0x7060302
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[29:30], 24, v[25:26]
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v35, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v49
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v54
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v30, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v37, v16, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v32
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v36, v15, v31, 0x7060302
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v31
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[80:81]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[30:31], 24, v[20:21]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[38:39]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[51:52]
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 24, v39
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v39
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v38
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 8, v38
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 24, v52
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 8, v52
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v51
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v51
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 24, v65
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v64
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v64
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 24, v81
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v85
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v84
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 24, v97
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 8, v97
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 24, v26
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 8, v26
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v97, 16, v25
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v25
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v21
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v19, 8, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[36:37]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[49:50]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[54:55]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[69:70]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[82:83]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[85:86]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[28:29]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[26:27]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 24, v50
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 8, v49
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 24, v55
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 8, v54
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 24, v70
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 24, v83
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v83
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 24, v86
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v86
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 8, v85
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 24, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v26
 ; GFX11-FAKE16-NEXT:  .LBB108_4: ; %end
 ; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v19
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v99
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v97
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v24
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v19
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v25
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v20, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v21
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v83
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v26, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v81
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v96
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v98
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v97
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v28, 8, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v26
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v20
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v19
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v29
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v21, v24
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v25
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v26
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v70
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v30, v19
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v96
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v26, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v81, 0xff, v87
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v86
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v87
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v86
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v28, 8, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v68
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v84
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v22
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v85
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v67
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v31, 8, v84
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v21, v24
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v25
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v26, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v83
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v65, 0xff, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v68, 8, v82
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v81, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v28, v22
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v29
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v30, v31
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v65, v68
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v20
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v19
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v21
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v24
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v26
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v82
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v81
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v71
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v21
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v80
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v54
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v71
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v70
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v52
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v69
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v69
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v19
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v20, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v23, v21
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v24
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v25, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v25, v26
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v27
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v68
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v66
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v20
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v66
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v48
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v55
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v64
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v55
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v22
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v21, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v53
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v23, v20
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v24
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v25, v26
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v27
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v28, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v28, v19
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v19
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v20
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v20
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v23
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v19
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v53
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v35
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v52
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v50
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v51
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v50
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v48
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v18, 8, v18
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v49
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v39
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v19
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v20, v21
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v22
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v23, v18
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v24
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v33
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v37
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v38
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v37
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v35
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v17
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v36
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v32
-; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v34
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v32
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v22
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v23, v17
@@ -89686,21 +89766,21 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; SI-NEXT:    v_mul_f32_e32 v42, 1.0, v2
 ; SI-NEXT:    v_mul_f32_e32 v20, 1.0, v1
 ; SI-NEXT:    v_mul_f32_e32 v25, 1.0, v4
-; SI-NEXT:    v_mul_f32_e32 v29, 1.0, v3
+; SI-NEXT:    v_mul_f32_e32 v28, 1.0, v3
 ; SI-NEXT:    v_mul_f32_e32 v43, 1.0, v6
 ; SI-NEXT:    v_mul_f32_e32 v23, 1.0, v5
 ; SI-NEXT:    v_mul_f32_e32 v31, 1.0, v8
-; SI-NEXT:    v_mul_f32_e32 v32, 1.0, v7
+; SI-NEXT:    v_mul_f32_e32 v34, 1.0, v7
 ; SI-NEXT:    v_mul_f32_e32 v44, 1.0, v10
-; SI-NEXT:    v_mul_f32_e32 v28, 1.0, v9
+; SI-NEXT:    v_mul_f32_e32 v29, 1.0, v9
 ; SI-NEXT:    v_mul_f32_e32 v12, 1.0, v12
 ; SI-NEXT:    v_mul_f32_e32 v35, 1.0, v11
 ; SI-NEXT:    v_mul_f32_e32 v56, 1.0, v14
-; SI-NEXT:    v_mul_f32_e32 v38, 1.0, v13
-; SI-NEXT:    v_mul_f32_e32 v14, 1.0, v16
+; SI-NEXT:    v_mul_f32_e32 v33, 1.0, v13
+; SI-NEXT:    v_mul_f32_e32 v36, 1.0, v16
 ; SI-NEXT:    v_mul_f32_e32 v39, 1.0, v15
-; SI-NEXT:    v_mul_f32_e32 v33, 1.0, v18
-; SI-NEXT:    v_mul_f32_e32 v45, 1.0, v17
+; SI-NEXT:    v_mul_f32_e32 v48, 1.0, v18
+; SI-NEXT:    v_mul_f32_e32 v32, 1.0, v17
 ; SI-NEXT:    v_mul_f32_e64 v3, 1.0, s16
 ; SI-NEXT:    v_mul_f32_e64 v22, 1.0, s19
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s18
@@ -89714,19 +89794,25 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; SI-NEXT:    v_mul_f32_e64 v7, 1.0, s26
 ; SI-NEXT:    v_mul_f32_e64 v10, 1.0, s29
 ; SI-NEXT:    v_mul_f32_e64 v11, 1.0, s28
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB109_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v19
@@ -89782,219 +89868,213 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v25
-; SI-NEXT:    v_alignbit_b32 v6, v1, v29, 16
+; SI-NEXT:    v_alignbit_b32 v6, v1, v28, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v31
-; SI-NEXT:    v_alignbit_b32 v3, v1, v32, 16
-; SI-NEXT:    v_mov_b32_e32 v48, v12
+; SI-NEXT:    v_alignbit_b32 v3, v1, v34, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v44
-; SI-NEXT:    v_alignbit_b32 v2, v1, v35, 16
-; SI-NEXT:    v_mov_b32_e32 v51, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v14
-; SI-NEXT:    v_alignbit_b32 v14, v12, v28, 16
-; SI-NEXT:    v_alignbit_b32 v4, v14, v3, 24
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v56
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v4, v14, v3, 16
-; SI-NEXT:    v_alignbit_b32 v8, v7, v38, 16
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v2, v1, v35, 16
+; SI-NEXT:    v_alignbit_b32 v8, v7, v33, 16
 ; SI-NEXT:    v_alignbit_b32 v4, v8, v2, 24
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v36
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v4, v8, v2, 16
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v4, v8, v2, 8
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v33
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v48
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v39, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v43
-; SI-NEXT:    v_alignbit_b32 v5, v4, v45, 16
-; SI-NEXT:    v_mov_b32_e32 v37, v32
-; SI-NEXT:    v_mov_b32_e32 v32, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v44
+; SI-NEXT:    v_alignbit_b32 v5, v4, v32, 16
+; SI-NEXT:    v_mov_b32_e32 v31, v23
 ; SI-NEXT:    v_alignbit_b32 v20, v18, v23, 16
+; SI-NEXT:    v_alignbit_b32 v14, v12, v29, 16
 ; SI-NEXT:    v_alignbit_b32 v23, v5, v1, 24
+; SI-NEXT:    v_mov_b32_e32 v38, v36
+; SI-NEXT:    v_alignbit_b32 v36, v20, v6, 24
+; SI-NEXT:    v_alignbit_b32 v25, v14, v3, 24
+; SI-NEXT:    v_alignbit_b32 v50, v8, v2, 16
+; SI-NEXT:    v_mov_b32_e32 v53, v32
 ; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v23, v5, v1, 16
-; SI-NEXT:    v_mov_b32_e32 v34, v29
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v23, v5, v1, 8
-; SI-NEXT:    v_mov_b32_e32 v29, v26
-; SI-NEXT:    v_lshrrev_b32_e32 v61, 24, v26
-; SI-NEXT:    v_mov_b32_e32 v26, v42
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 24, v42
-; SI-NEXT:    v_mov_b32_e32 v42, v33
-; SI-NEXT:    v_mov_b32_e32 v36, v31
-; SI-NEXT:    v_mov_b32_e32 v49, v35
-; SI-NEXT:    v_mov_b32_e32 v52, v39
-; SI-NEXT:    v_alignbit_b32 v53, v20, v6, 24
+; SI-NEXT:    v_alignbit_b32 v32, v5, v1, 8
 ; SI-NEXT:    v_alignbit_b32 v55, v20, v6, 16
 ; SI-NEXT:    v_alignbit_b32 v40, v20, v6, 8
-; SI-NEXT:    v_mov_b32_e32 v35, v28
+; SI-NEXT:    v_mov_b32_e32 v35, v29
+; SI-NEXT:    v_alignbit_b32 v52, v14, v3, 16
 ; SI-NEXT:    v_alignbit_b32 v54, v14, v3, 8
-; SI-NEXT:    v_mov_b32_e32 v39, v38
-; SI-NEXT:    v_mov_b32_e32 v50, v45
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT:    v_mov_b32_e32 v25, v22
+; SI-NEXT:    v_mov_b32_e32 v37, v33
+; SI-NEXT:    v_alignbit_b32 v51, v8, v2, 8
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; SI-NEXT:    v_lshrrev_b32_e32 v22, 24, v22
 ; SI-NEXT:    v_lshrrev_b32_e32 v62, 8, v30
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v23, v41
 ; SI-NEXT:    v_lshrrev_b32_e32 v41, 24, v41
 ; SI-NEXT:    v_lshrrev_b32_e32 v46, 8, v19
+; SI-NEXT:    v_mov_b32_e32 v28, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v61, 24, v26
 ; SI-NEXT:    v_lshrrev_b32_e32 v57, 8, v16
+; SI-NEXT:    v_mov_b32_e32 v26, v42
+; SI-NEXT:    v_lshrrev_b32_e32 v63, 24, v42
 ; SI-NEXT:    v_lshrrev_b32_e32 v58, 8, v11
-; SI-NEXT:    v_mov_b32_e32 v31, v43
+; SI-NEXT:    v_mov_b32_e32 v29, v43
 ; SI-NEXT:    v_lshrrev_b32_e32 v59, 24, v43
 ; SI-NEXT:    v_lshrrev_b32_e32 v47, 8, v20
-; SI-NEXT:    v_mov_b32_e32 v28, v44
+; SI-NEXT:    v_mov_b32_e32 v34, v44
 ; SI-NEXT:    v_lshrrev_b32_e32 v45, 24, v44
 ; SI-NEXT:    v_lshrrev_b32_e32 v60, 8, v14
-; SI-NEXT:    v_mov_b32_e32 v38, v56
+; SI-NEXT:    v_mov_b32_e32 v33, v56
 ; SI-NEXT:    v_lshrrev_b32_e32 v43, 24, v56
 ; SI-NEXT:    v_lshrrev_b32_e32 v56, 8, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 24, v42
+; SI-NEXT:    v_mov_b32_e32 v49, v48
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 24, v48
+; SI-NEXT:    v_mov_b32_e32 v48, v32
+; SI-NEXT:    v_mov_b32_e32 v32, v50
+; SI-NEXT:    v_mov_b32_e32 v50, v25
+; SI-NEXT:    v_mov_b32_e32 v25, v36
+; SI-NEXT:    v_mov_b32_e32 v36, v38
 ; SI-NEXT:    v_lshrrev_b32_e32 v44, 8, v5
 ; SI-NEXT:    s_cbranch_execnz .LBB109_3
 ; SI-NEXT:  .LBB109_2: ; %cmp.true
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v28
-; SI-NEXT:    v_add_f32_e32 v44, 0x40c00000, v9
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v51
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v52
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v36
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v39
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v33
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v49
 ; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v50
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v53
 ; SI-NEXT:    v_add_f32_e32 v42, 0x40c00000, v3
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v42
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v48
 ; SI-NEXT:    v_alignbit_b32 v5, v4, v2, 16
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v49
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v33
+; SI-NEXT:    v_add_f32_e32 v43, 0x40c00000, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v43
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v34
+; SI-NEXT:    v_add_f32_e32 v44, 0x40c00000, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v44
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v29
+; SI-NEXT:    v_add_f32_e32 v45, 0x40c00000, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v45
+; SI-NEXT:    v_alignbit_b32 v48, v5, v1, 8
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 24, v43
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 24, v42
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v38
 ; SI-NEXT:    v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v39
-; SI-NEXT:    v_add_f32_e32 v43, 0x40c00000, v6
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v37
 ; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v43
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v36
 ; SI-NEXT:    v_alignbit_b32 v8, v7, v3, 16
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v37
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v32, v8, v2, 16
+; SI-NEXT:    v_alignbit_b32 v51, v8, v2, 8
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_alignbit_b32 v3, v6, v3, 16
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v35
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v31
-; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v44
-; SI-NEXT:    v_add_f32_e32 v45, 0x40c00000, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT:    v_alignbit_b32 v14, v12, v6, 16
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v34
-; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v45
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT:    v_alignbit_b32 v54, v14, v3, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 24, v43
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 24, v42
-; SI-NEXT:    v_lshrrev_b32_e32 v60, 8, v14
-; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_alignbit_b32 v6, v9, v6, 16
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v32
-; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT:    v_alignbit_b32 v20, v18, v9, 16
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT:    v_alignbit_b32 v53, v20, v6, 24
-; SI-NEXT:    v_alignbit_b32 v55, v20, v6, 16
-; SI-NEXT:    v_alignbit_b32 v40, v20, v6, 8
-; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
 ; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; SI-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT:    v_alignbit_b32 v21, v19, v17, 16
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
 ; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
 ; SI-NEXT:    v_alignbit_b32 v15, v15, v13, 16
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v21, v19, v17, 16
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT:    v_alignbit_b32 v10, v10, v9, 16
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    v_alignbit_b32 v3, v6, v3, 16
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v35
+; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT:    v_alignbit_b32 v14, v12, v6, 16
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v50, v14, v3, 24
+; SI-NEXT:    v_alignbit_b32 v52, v14, v3, 16
+; SI-NEXT:    v_alignbit_b32 v54, v14, v3, 8
+; SI-NEXT:    v_lshrrev_b32_e32 v60, 8, v14
 ; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v13
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; SI-NEXT:    v_add_f32_e32 v19, 0x40c00000, v17
 ; SI-NEXT:    v_and_b32_e32 v17, 0xffff0000, v23
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v28
 ; SI-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
 ; SI-NEXT:    v_add_f32_e32 v41, 0x40c00000, v17
+; SI-NEXT:    v_add_f32_e32 v56, 0x40c00000, v13
 ; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v41
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v56
 ; SI-NEXT:    v_lshrrev_b32_e32 v41, 24, v41
+; SI-NEXT:    v_lshrrev_b32_e32 v61, 24, v56
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 8, v8
 ; SI-NEXT:    v_alignbit_b32 v19, v17, v19, 16
+; SI-NEXT:    v_alignbit_b32 v16, v13, v16, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v46, 8, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v57, 8, v16
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT:    v_alignbit_b32 v6, v9, v6, 16
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v31
+; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT:    v_alignbit_b32 v20, v18, v9, 16
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v25, v20, v6, 24
+; SI-NEXT:    v_alignbit_b32 v55, v20, v6, 16
+; SI-NEXT:    v_alignbit_b32 v40, v20, v6, 8
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v9
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v26
-; SI-NEXT:    v_add_f32_e32 v47, 0x40c00000, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v47
-; SI-NEXT:    v_alignbit_b32 v11, v9, v11, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v58, 8, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 24, v47
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 8, v20
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; SI-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
 ; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
 ; SI-NEXT:    v_alignbit_b32 v27, v23, v22, 16
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff0000, v25
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT:    v_add_f32_e32 v59, 0x40c00000, v23
-; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v13
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v29
-; SI-NEXT:    v_lshrrev_b32_e32 v24, 16, v59
-; SI-NEXT:    v_add_f32_e32 v56, 0x40c00000, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v56
-; SI-NEXT:    v_lshrrev_b32_e32 v61, 24, v56
-; SI-NEXT:    v_lshrrev_b32_e32 v56, 8, v8
-; SI-NEXT:    v_alignbit_b32 v16, v13, v16, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 8, v16
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT:    v_alignbit_b32 v10, v10, v9, 16
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; SI-NEXT:    v_add_f32_e32 v59, 0x40c00000, v23
 ; SI-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v24, 16, v59
 ; SI-NEXT:    v_alignbit_b32 v30, v24, v22, 16
 ; SI-NEXT:    v_alignbit_b32 v22, v30, v27, 24
 ; SI-NEXT:    v_lshrrev_b32_e32 v62, 8, v30
@@ -90023,8 +90103,18 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v22, v16, v15, 8
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v9
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v26
+; SI-NEXT:    v_add_f32_e32 v47, 0x40c00000, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v47
+; SI-NEXT:    v_alignbit_b32 v11, v9, v11, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v22, v11, v10, 24
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 8, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v63, 24, v47
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 8, v20
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v22, v11, v10, 16
@@ -90033,30 +90123,15 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; SI-NEXT:    v_alignbit_b32 v22, v11, v10, 8
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v22, v14, v3, 24
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v22, v14, v3, 16
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v22, v8, v2, 24
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v22, v8, v2, 16
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v22, v8, v2, 8
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v22, v5, v1, 24
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v22, v5, v1, 16
 ; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v22, v5, v1, 8
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v22, 24, v59
 ; SI-NEXT:    v_lshrrev_b32_e32 v59, 24, v45
 ; SI-NEXT:    v_lshrrev_b32_e32 v45, 24, v44
@@ -90189,7 +90264,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v6, v6, v9
 ; SI-NEXT:    v_and_b32_e32 v9, 0xff, v55
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v53
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v25
 ; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; SI-NEXT:    v_or_b32_e32 v9, v10, v9
 ; SI-NEXT:    v_or_b32_e32 v6, v6, v9
@@ -90210,14 +90285,10 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v54
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v6
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v52
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 24, v50
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-NEXT:    v_or_b32_e32 v6, v9, v6
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v6
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, 40, v0
@@ -90234,19 +90305,15 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v6
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, 44, v0
 ; SI-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v51
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v32
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
 ; SI-NEXT:    v_or_b32_e32 v3, v6, v3
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
@@ -90264,18 +90331,16 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v48
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
@@ -90314,56 +90379,39 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; kill: killed $vgpr4
-; SI-NEXT:    v_mov_b32_e32 v52, v39
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    v_mov_b32_e32 v51, v14
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    v_mov_b32_e32 v50, v45
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    v_mov_b32_e32 v49, v35
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    v_mov_b32_e32 v48, v12
+; SI-NEXT:    v_mov_b32_e32 v53, v32
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    v_mov_b32_e32 v39, v38
+; SI-NEXT:    v_mov_b32_e32 v49, v48
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    v_mov_b32_e32 v38, v56
+; SI-NEXT:    v_mov_b32_e32 v37, v33
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    v_mov_b32_e32 v37, v32
+; SI-NEXT:    v_mov_b32_e32 v33, v56
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    v_mov_b32_e32 v36, v31
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v35, v29
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    v_mov_b32_e32 v35, v28
+; SI-NEXT:    v_mov_b32_e32 v34, v44
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    v_mov_b32_e32 v28, v44
+; SI-NEXT:    v_mov_b32_e32 v31, v23
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    v_mov_b32_e32 v34, v29
+; SI-NEXT:    v_mov_b32_e32 v29, v43
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    v_mov_b32_e32 v32, v23
+; SI-NEXT:    v_mov_b32_e32 v28, v26
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    v_mov_b32_e32 v31, v43
+; SI-NEXT:    v_mov_b32_e32 v26, v42
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    v_mov_b32_e32 v29, v26
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    v_mov_b32_e32 v26, v42
 ; SI-NEXT:    v_mov_b32_e32 v23, v41
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v25, v22
+; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr4
@@ -90391,28 +90439,33 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; SI-NEXT:    ; implicit-def: $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr40
 ; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    ; implicit-def: $vgpr53
+; SI-NEXT:    ; implicit-def: $vgpr25
 ; SI-NEXT:    ; implicit-def: $vgpr20
 ; SI-NEXT:    ; implicit-def: $vgpr47
 ; SI-NEXT:    ; implicit-def: $vgpr18
 ; SI-NEXT:    ; implicit-def: $vgpr59
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr54
+; SI-NEXT:    ; implicit-def: $vgpr52
+; SI-NEXT:    ; implicit-def: $vgpr50
 ; SI-NEXT:    ; implicit-def: $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr60
 ; SI-NEXT:    ; implicit-def: $vgpr12
 ; SI-NEXT:    ; implicit-def: $vgpr45
 ; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr8
 ; SI-NEXT:    ; implicit-def: $vgpr56
 ; SI-NEXT:    ; implicit-def: $vgpr7
 ; SI-NEXT:    ; implicit-def: $vgpr43
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; kill: killed $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr44
 ; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr42
 ; SI-NEXT:    s_branch .LBB109_2
 ;
@@ -91188,27 +91241,27 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX9-NEXT:    s_lshr_b32 s60, s22, 8
 ; GFX9-NEXT:    s_lshr_b32 s48, s21, 24
 ; GFX9-NEXT:    s_lshr_b32 s78, s21, 16
-; GFX9-NEXT:    s_lshr_b32 s45, s21, 8
+; GFX9-NEXT:    s_lshr_b32 s59, s21, 8
 ; GFX9-NEXT:    s_lshr_b32 s49, s20, 16
-; GFX9-NEXT:    s_lshr_b32 s44, s20, 8
+; GFX9-NEXT:    s_lshr_b32 s58, s20, 8
 ; GFX9-NEXT:    s_lshr_b32 s50, s19, 24
 ; GFX9-NEXT:    s_lshr_b32 s77, s19, 16
-; GFX9-NEXT:    s_lshr_b32 s13, s19, 8
+; GFX9-NEXT:    s_lshr_b32 s57, s19, 8
 ; GFX9-NEXT:    s_lshr_b32 s51, s18, 16
-; GFX9-NEXT:    s_lshr_b32 s12, s18, 8
+; GFX9-NEXT:    s_lshr_b32 s56, s18, 8
 ; GFX9-NEXT:    s_lshr_b32 s52, s17, 24
 ; GFX9-NEXT:    s_lshr_b32 s76, s17, 16
 ; GFX9-NEXT:    s_lshr_b32 s53, s17, 8
 ; GFX9-NEXT:    s_lshr_b32 s54, s16, 16
 ; GFX9-NEXT:    s_lshr_b32 s55, s16, 8
-; GFX9-NEXT:    s_lshr_b64 s[8:9], s[4:5], 24
-; GFX9-NEXT:    s_lshr_b64 s[10:11], s[28:29], 24
-; GFX9-NEXT:    s_lshr_b64 s[14:15], s[26:27], 24
-; GFX9-NEXT:    s_lshr_b64 s[40:41], s[24:25], 24
-; GFX9-NEXT:    s_lshr_b64 s[42:43], s[22:23], 24
-; GFX9-NEXT:    s_lshr_b64 s[46:47], s[20:21], 24
-; GFX9-NEXT:    s_lshr_b64 s[56:57], s[18:19], 24
-; GFX9-NEXT:    s_lshr_b64 s[58:59], s[16:17], 24
+; GFX9-NEXT:    s_lshr_b64 s[6:7], s[4:5], 24
+; GFX9-NEXT:    s_lshr_b64 s[8:9], s[28:29], 24
+; GFX9-NEXT:    s_lshr_b64 s[10:11], s[26:27], 24
+; GFX9-NEXT:    s_lshr_b64 s[12:13], s[24:25], 24
+; GFX9-NEXT:    s_lshr_b64 s[14:15], s[22:23], 24
+; GFX9-NEXT:    s_lshr_b64 s[40:41], s[20:21], 24
+; GFX9-NEXT:    s_lshr_b64 s[42:43], s[18:19], 24
+; GFX9-NEXT:    s_lshr_b64 s[44:45], s[16:17], 24
 ; GFX9-NEXT:    s_cbranch_execnz .LBB109_3
 ; GFX9-NEXT:  .LBB109_2: ; %cmp.true
 ; GFX9-NEXT:    s_and_b32 s6, s17, 0xffff0000
@@ -91237,357 +91290,357 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX9-NEXT:    s_and_b32 s6, s16, 0xffff0000
 ; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX9-NEXT:    s_bfe_u32 s8, s6, 0x10010
-; GFX9-NEXT:    s_add_i32 s8, s8, s6
-; GFX9-NEXT:    s_add_i32 s10, s8, 0x7fff
-; GFX9-NEXT:    s_bitset1_b32 s6, 22
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s6, s6, s10
-; GFX9-NEXT:    s_lshl_b32 s8, s16, 16
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
-; GFX9-NEXT:    s_add_i32 s10, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s11, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
+; GFX9-NEXT:    s_lshr_b32 s8, s6, 16
+; GFX9-NEXT:    s_lshl_b32 s6, s16, 16
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_add_i32 s9, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s10, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s11, s10
-; GFX9-NEXT:    s_lshr_b32 s16, s8, 16
-; GFX9-NEXT:    s_and_b32 s8, s19, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_add_i32 s10, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s11, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s10, s9
+; GFX9-NEXT:    s_lshr_b32 s16, s6, 16
+; GFX9-NEXT:    s_and_b32 s6, s19, 0xffff0000
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_pack_ll_b32_b16 s46, s16, s8
+; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s11, s10
-; GFX9-NEXT:    s_lshr_b32 s77, s8, 16
-; GFX9-NEXT:    s_lshl_b32 s8, s19, 16
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_add_i32 s10, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s11, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
+; GFX9-NEXT:    s_lshr_b32 s77, s6, 16
+; GFX9-NEXT:    s_lshl_b32 s6, s19, 16
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s11, s10
-; GFX9-NEXT:    s_lshr_b32 s19, s8, 16
-; GFX9-NEXT:    s_and_b32 s8, s18, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_add_i32 s10, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s11, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
+; GFX9-NEXT:    s_lshr_b32 s19, s6, 16
+; GFX9-NEXT:    s_and_b32 s6, s18, 0xffff0000
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s11, s10
-; GFX9-NEXT:    s_lshr_b32 s10, s8, 16
-; GFX9-NEXT:    s_lshl_b32 s8, s18, 16
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_add_i32 s11, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s12, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
+; GFX9-NEXT:    s_lshr_b32 s8, s6, 16
+; GFX9-NEXT:    s_lshl_b32 s6, s18, 16
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_add_i32 s9, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s10, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s12, s11
-; GFX9-NEXT:    s_lshr_b32 s18, s8, 16
-; GFX9-NEXT:    s_and_b32 s8, s21, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s12, s18, s10
-; GFX9-NEXT:    s_add_i32 s10, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s11, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s10, s9
+; GFX9-NEXT:    s_lshr_b32 s18, s6, 16
+; GFX9-NEXT:    s_and_b32 s6, s21, 0xffff0000
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_pack_ll_b32_b16 s56, s18, s8
+; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s11, s10
-; GFX9-NEXT:    s_lshr_b32 s78, s8, 16
-; GFX9-NEXT:    s_lshl_b32 s8, s21, 16
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_add_i32 s10, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s11, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
+; GFX9-NEXT:    s_lshr_b32 s78, s6, 16
+; GFX9-NEXT:    s_lshl_b32 s6, s21, 16
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s11, s10
-; GFX9-NEXT:    s_lshr_b32 s21, s8, 16
-; GFX9-NEXT:    s_and_b32 s8, s20, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_add_i32 s10, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s11, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
+; GFX9-NEXT:    s_lshr_b32 s21, s6, 16
+; GFX9-NEXT:    s_and_b32 s6, s20, 0xffff0000
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s11, s10
-; GFX9-NEXT:    s_lshr_b32 s10, s8, 16
-; GFX9-NEXT:    s_lshl_b32 s8, s20, 16
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_add_i32 s11, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s14, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
+; GFX9-NEXT:    s_lshr_b32 s8, s6, 16
+; GFX9-NEXT:    s_lshl_b32 s6, s20, 16
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_add_i32 s9, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s10, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s14, s11
-; GFX9-NEXT:    s_lshr_b32 s20, s8, 16
-; GFX9-NEXT:    s_and_b32 s8, s23, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s44, s20, s10
-; GFX9-NEXT:    s_add_i32 s10, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s11, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s10, s9
+; GFX9-NEXT:    s_lshr_b32 s20, s6, 16
+; GFX9-NEXT:    s_and_b32 s6, s23, 0xffff0000
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_pack_ll_b32_b16 s58, s20, s8
+; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s11, s10
-; GFX9-NEXT:    s_lshr_b32 s79, s8, 16
-; GFX9-NEXT:    s_lshl_b32 s8, s23, 16
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_add_i32 s10, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s11, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
+; GFX9-NEXT:    s_lshr_b32 s79, s6, 16
+; GFX9-NEXT:    s_lshl_b32 s6, s23, 16
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s11, s10
-; GFX9-NEXT:    s_lshr_b32 s23, s8, 16
-; GFX9-NEXT:    s_and_b32 s8, s22, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_add_i32 s10, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s11, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
+; GFX9-NEXT:    s_lshr_b32 s23, s6, 16
+; GFX9-NEXT:    s_and_b32 s6, s22, 0xffff0000
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s11, s10
-; GFX9-NEXT:    s_lshr_b32 s10, s8, 16
-; GFX9-NEXT:    s_lshl_b32 s8, s22, 16
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_add_i32 s11, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s14, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
+; GFX9-NEXT:    s_lshr_b32 s8, s6, 16
+; GFX9-NEXT:    s_lshl_b32 s6, s22, 16
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_add_i32 s9, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s10, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s14, s11
-; GFX9-NEXT:    s_lshr_b32 s22, s8, 16
-; GFX9-NEXT:    s_and_b32 s8, s25, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s60, s22, s10
-; GFX9-NEXT:    s_add_i32 s10, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s11, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s10, s9
+; GFX9-NEXT:    s_lshr_b32 s22, s6, 16
+; GFX9-NEXT:    s_and_b32 s6, s25, 0xffff0000
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_pack_ll_b32_b16 s60, s22, s8
+; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s11, s10
-; GFX9-NEXT:    s_lshr_b32 s88, s8, 16
-; GFX9-NEXT:    s_lshl_b32 s8, s25, 16
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_add_i32 s10, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s11, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
+; GFX9-NEXT:    s_lshr_b32 s88, s6, 16
+; GFX9-NEXT:    s_lshl_b32 s6, s25, 16
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s11, s10
-; GFX9-NEXT:    s_lshr_b32 s25, s8, 16
-; GFX9-NEXT:    s_and_b32 s8, s24, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_add_i32 s10, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s11, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
+; GFX9-NEXT:    s_lshr_b32 s25, s6, 16
+; GFX9-NEXT:    s_and_b32 s6, s24, 0xffff0000
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s11, s10
-; GFX9-NEXT:    s_lshr_b32 s10, s8, 16
-; GFX9-NEXT:    s_lshl_b32 s8, s24, 16
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_add_i32 s11, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s14, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
+; GFX9-NEXT:    s_lshr_b32 s8, s6, 16
+; GFX9-NEXT:    s_lshl_b32 s6, s24, 16
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_add_i32 s9, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s10, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s14, s11
-; GFX9-NEXT:    s_lshr_b32 s24, s8, 16
-; GFX9-NEXT:    s_and_b32 s8, s27, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s62, s24, s10
-; GFX9-NEXT:    s_add_i32 s10, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s11, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s10, s9
+; GFX9-NEXT:    s_lshr_b32 s24, s6, 16
+; GFX9-NEXT:    s_and_b32 s6, s27, 0xffff0000
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_pack_ll_b32_b16 s62, s24, s8
+; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s11, s10
-; GFX9-NEXT:    s_lshr_b32 s89, s8, 16
-; GFX9-NEXT:    s_lshl_b32 s8, s27, 16
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_add_i32 s10, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s11, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
+; GFX9-NEXT:    s_lshr_b32 s89, s6, 16
+; GFX9-NEXT:    s_lshl_b32 s6, s27, 16
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s11, s10
-; GFX9-NEXT:    s_lshr_b32 s27, s8, 16
-; GFX9-NEXT:    s_and_b32 s8, s26, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_add_i32 s10, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s11, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
+; GFX9-NEXT:    s_lshr_b32 s27, s6, 16
+; GFX9-NEXT:    s_and_b32 s6, s26, 0xffff0000
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s11, s10
-; GFX9-NEXT:    s_lshr_b32 s10, s8, 16
-; GFX9-NEXT:    s_lshl_b32 s8, s26, 16
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_add_i32 s11, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s14, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
+; GFX9-NEXT:    s_lshr_b32 s8, s6, 16
+; GFX9-NEXT:    s_lshl_b32 s6, s26, 16
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_add_i32 s9, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s10, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s14, s11
-; GFX9-NEXT:    s_lshr_b32 s26, s8, 16
-; GFX9-NEXT:    s_and_b32 s8, s29, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s72, s26, s10
-; GFX9-NEXT:    s_add_i32 s10, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s11, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s10, s9
+; GFX9-NEXT:    s_lshr_b32 s26, s6, 16
+; GFX9-NEXT:    s_and_b32 s6, s29, 0xffff0000
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_pack_ll_b32_b16 s72, s26, s8
+; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s11, s10
-; GFX9-NEXT:    s_lshr_b32 s90, s8, 16
-; GFX9-NEXT:    s_lshl_b32 s8, s29, 16
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_add_i32 s10, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s11, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
+; GFX9-NEXT:    s_lshr_b32 s90, s6, 16
+; GFX9-NEXT:    s_lshl_b32 s6, s29, 16
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s11, s10
-; GFX9-NEXT:    s_lshr_b32 s29, s8, 16
-; GFX9-NEXT:    s_and_b32 s8, s28, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_add_i32 s10, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s11, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
+; GFX9-NEXT:    s_lshr_b32 s29, s6, 16
+; GFX9-NEXT:    s_and_b32 s6, s28, 0xffff0000
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s11, s10
-; GFX9-NEXT:    s_lshr_b32 s10, s8, 16
-; GFX9-NEXT:    s_lshl_b32 s8, s28, 16
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_add_i32 s11, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s14, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
+; GFX9-NEXT:    s_lshr_b32 s8, s6, 16
+; GFX9-NEXT:    s_lshl_b32 s6, s28, 16
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_add_i32 s9, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s10, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s14, s11
-; GFX9-NEXT:    s_lshr_b32 s28, s8, 16
-; GFX9-NEXT:    s_and_b32 s8, s5, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s74, s28, s10
-; GFX9-NEXT:    s_add_i32 s10, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s11, s8, 0x400000
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s10, s9
+; GFX9-NEXT:    s_lshr_b32 s28, s6, 16
+; GFX9-NEXT:    s_and_b32 s6, s5, 0xffff0000
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_pack_ll_b32_b16 s74, s28, s8
+; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s11, s10
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX9-NEXT:    v_add_f32_e32 v2, s5, v1
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
-; GFX9-NEXT:    s_lshr_b32 s91, s8, 16
-; GFX9-NEXT:    s_bfe_u32 s8, s5, 0x10010
-; GFX9-NEXT:    s_add_i32 s8, s8, s5
-; GFX9-NEXT:    s_add_i32 s10, s8, 0x7fff
+; GFX9-NEXT:    s_lshr_b32 s91, s6, 16
+; GFX9-NEXT:    s_bfe_u32 s6, s5, 0x10010
+; GFX9-NEXT:    s_add_i32 s6, s6, s5
+; GFX9-NEXT:    s_add_i32 s8, s6, 0x7fff
 ; GFX9-NEXT:    s_bitset1_b32 s5, 22
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s5, s5, s10
-; GFX9-NEXT:    s_and_b32 s8, s4, 0xffff0000
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
-; GFX9-NEXT:    s_bfe_u32 s9, s8, 0x10010
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s5, s5, s8
+; GFX9-NEXT:    s_and_b32 s6, s4, 0xffff0000
+; GFX9-NEXT:    v_add_f32_e32 v2, s6, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX9-NEXT:    s_bfe_u32 s7, s6, 0x10010
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
 ; GFX9-NEXT:    s_lshr_b32 s5, s5, 16
-; GFX9-NEXT:    s_add_i32 s10, s9, 0x7fff
-; GFX9-NEXT:    s_or_b32 s11, s8, 0x400000
+; GFX9-NEXT:    s_add_i32 s8, s7, 0x7fff
+; GFX9-NEXT:    s_or_b32 s9, s6, 0x400000
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s8, s11, s10
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX9-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX9-NEXT:    v_add_f32_e32 v1, s4, v1
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX9-NEXT:    s_lshr_b32 s10, s8, 16
-; GFX9-NEXT:    s_bfe_u32 s8, s4, 0x10010
-; GFX9-NEXT:    s_add_i32 s8, s8, s4
-; GFX9-NEXT:    s_add_i32 s11, s8, 0x7fff
+; GFX9-NEXT:    s_lshr_b32 s8, s6, 16
+; GFX9-NEXT:    s_bfe_u32 s6, s4, 0x10010
+; GFX9-NEXT:    s_add_i32 s6, s6, s4
+; GFX9-NEXT:    s_add_i32 s9, s6, 0x7fff
 ; GFX9-NEXT:    s_bitset1_b32 s4, 22
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GFX9-NEXT:    s_cselect_b32 s4, s4, s11
+; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s4, s4, s9
 ; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX9-NEXT:    s_pack_ll_b32_b16 s7, s17, s76
-; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s16, s6
-; GFX9-NEXT:    s_pack_ll_b32_b16 s13, s19, s77
-; GFX9-NEXT:    s_pack_ll_b32_b16 s45, s21, s78
+; GFX9-NEXT:    s_pack_ll_b32_b16 s47, s17, s76
+; GFX9-NEXT:    s_pack_ll_b32_b16 s57, s19, s77
+; GFX9-NEXT:    s_pack_ll_b32_b16 s59, s21, s78
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s61, s23, s79
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s63, s25, s88
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s73, s27, s89
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s75, s29, s90
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s31, s5, s91
-; GFX9-NEXT:    s_pack_ll_b32_b16 s30, s4, s10
-; GFX9-NEXT:    s_lshr_b64 s[8:9], s[30:31], 24
-; GFX9-NEXT:    s_lshr_b64 s[10:11], s[74:75], 24
-; GFX9-NEXT:    s_lshr_b64 s[14:15], s[72:73], 24
-; GFX9-NEXT:    s_lshr_b64 s[40:41], s[62:63], 24
-; GFX9-NEXT:    s_lshr_b64 s[42:43], s[60:61], 24
-; GFX9-NEXT:    s_lshr_b64 s[46:47], s[44:45], 24
-; GFX9-NEXT:    s_lshr_b64 s[56:57], s[12:13], 24
-; GFX9-NEXT:    s_lshr_b64 s[58:59], s[6:7], 24
+; GFX9-NEXT:    s_pack_ll_b32_b16 s30, s4, s8
+; GFX9-NEXT:    s_lshr_b64 s[6:7], s[30:31], 24
+; GFX9-NEXT:    s_lshr_b64 s[8:9], s[74:75], 24
+; GFX9-NEXT:    s_lshr_b64 s[10:11], s[72:73], 24
+; GFX9-NEXT:    s_lshr_b64 s[12:13], s[62:63], 24
+; GFX9-NEXT:    s_lshr_b64 s[14:15], s[60:61], 24
+; GFX9-NEXT:    s_lshr_b64 s[40:41], s[58:59], 24
+; GFX9-NEXT:    s_lshr_b64 s[42:43], s[56:57], 24
+; GFX9-NEXT:    s_lshr_b64 s[44:45], s[46:47], 24
 ; GFX9-NEXT:    s_lshr_b32 s92, s31, 24
 ; GFX9-NEXT:    s_lshr_b32 s93, s31, 8
 ; GFX9-NEXT:    s_lshr_b32 s94, s30, 16
@@ -91608,178 +91661,178 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX9-NEXT:    s_lshr_b32 s61, s61, 8
 ; GFX9-NEXT:    s_lshr_b32 s39, s60, 16
 ; GFX9-NEXT:    s_lshr_b32 s60, s60, 8
-; GFX9-NEXT:    s_lshr_b32 s48, s45, 24
-; GFX9-NEXT:    s_lshr_b32 s45, s45, 8
-; GFX9-NEXT:    s_lshr_b32 s49, s44, 16
-; GFX9-NEXT:    s_lshr_b32 s44, s44, 8
-; GFX9-NEXT:    s_lshr_b32 s50, s13, 24
-; GFX9-NEXT:    s_lshr_b32 s13, s13, 8
-; GFX9-NEXT:    s_lshr_b32 s51, s12, 16
-; GFX9-NEXT:    s_lshr_b32 s12, s12, 8
-; GFX9-NEXT:    s_lshr_b32 s52, s7, 24
-; GFX9-NEXT:    s_lshr_b32 s53, s7, 8
-; GFX9-NEXT:    s_lshr_b32 s54, s6, 16
-; GFX9-NEXT:    s_lshr_b32 s55, s6, 8
+; GFX9-NEXT:    s_lshr_b32 s48, s59, 24
+; GFX9-NEXT:    s_lshr_b32 s59, s59, 8
+; GFX9-NEXT:    s_lshr_b32 s49, s58, 16
+; GFX9-NEXT:    s_lshr_b32 s58, s58, 8
+; GFX9-NEXT:    s_lshr_b32 s50, s57, 24
+; GFX9-NEXT:    s_lshr_b32 s57, s57, 8
+; GFX9-NEXT:    s_lshr_b32 s51, s56, 16
+; GFX9-NEXT:    s_lshr_b32 s56, s56, 8
+; GFX9-NEXT:    s_lshr_b32 s52, s47, 24
+; GFX9-NEXT:    s_lshr_b32 s53, s47, 8
+; GFX9-NEXT:    s_lshr_b32 s54, s46, 16
+; GFX9-NEXT:    s_lshr_b32 s55, s46, 8
 ; GFX9-NEXT:  .LBB109_3: ; %end
-; GFX9-NEXT:    s_and_b32 s6, s16, 0xff
-; GFX9-NEXT:    s_lshl_b32 s7, s55, 8
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    s_and_b32 s7, s54, 0xff
-; GFX9-NEXT:    s_lshl_b32 s9, s58, 8
+; GFX9-NEXT:    s_and_b32 s7, s16, 0xff
+; GFX9-NEXT:    s_lshl_b32 s9, s55, 8
 ; GFX9-NEXT:    s_or_b32 s7, s7, s9
-; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    s_and_b32 s6, s17, 0xff
-; GFX9-NEXT:    s_lshl_b32 s7, s53, 8
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    s_and_b32 s7, s76, 0xff
-; GFX9-NEXT:    s_lshl_b32 s9, s52, 8
+; GFX9-NEXT:    s_and_b32 s9, s54, 0xff
+; GFX9-NEXT:    s_lshl_b32 s11, s44, 8
+; GFX9-NEXT:    s_or_b32 s9, s9, s11
+; GFX9-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s9, s9, 16
+; GFX9-NEXT:    s_or_b32 s7, s7, s9
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    s_and_b32 s7, s17, 0xff
+; GFX9-NEXT:    s_lshl_b32 s9, s53, 8
+; GFX9-NEXT:    s_or_b32 s7, s7, s9
+; GFX9-NEXT:    s_and_b32 s9, s76, 0xff
+; GFX9-NEXT:    s_lshl_b32 s11, s52, 8
+; GFX9-NEXT:    s_or_b32 s9, s9, s11
+; GFX9-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s9, s9, 16
 ; GFX9-NEXT:    s_or_b32 s7, s7, s9
-; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    s_and_b32 s6, s18, 0xff
-; GFX9-NEXT:    s_lshl_b32 s7, s12, 8
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    s_and_b32 s7, s51, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    s_and_b32 s7, s18, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s9, s56, 8
 ; GFX9-NEXT:    s_or_b32 s7, s7, s9
-; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
+; GFX9-NEXT:    s_and_b32 s9, s51, 0xff
+; GFX9-NEXT:    s_lshl_b32 s11, s42, 8
+; GFX9-NEXT:    s_or_b32 s9, s9, s11
+; GFX9-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s9, s9, 16
+; GFX9-NEXT:    s_or_b32 s7, s7, s9
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    s_and_b32 s6, s19, 0xff
-; GFX9-NEXT:    s_lshl_b32 s7, s13, 8
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    s_and_b32 s7, s77, 0xff
-; GFX9-NEXT:    s_lshl_b32 s9, s50, 8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    s_and_b32 s7, s19, 0xff
+; GFX9-NEXT:    s_lshl_b32 s9, s57, 8
+; GFX9-NEXT:    s_or_b32 s7, s7, s9
+; GFX9-NEXT:    s_and_b32 s9, s77, 0xff
+; GFX9-NEXT:    s_lshl_b32 s11, s50, 8
+; GFX9-NEXT:    s_or_b32 s9, s9, s11
+; GFX9-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s9, s9, 16
 ; GFX9-NEXT:    s_or_b32 s7, s7, s9
-; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    s_and_b32 s6, s20, 0xff
-; GFX9-NEXT:    s_lshl_b32 s7, s44, 8
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    s_and_b32 s7, s49, 0xff
-; GFX9-NEXT:    s_lshl_b32 s9, s46, 8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    s_and_b32 s7, s20, 0xff
+; GFX9-NEXT:    s_lshl_b32 s9, s58, 8
+; GFX9-NEXT:    s_or_b32 s7, s7, s9
+; GFX9-NEXT:    s_and_b32 s9, s49, 0xff
+; GFX9-NEXT:    s_lshl_b32 s11, s40, 8
+; GFX9-NEXT:    s_or_b32 s9, s9, s11
+; GFX9-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s9, s9, 16
 ; GFX9-NEXT:    s_or_b32 s7, s7, s9
-; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    s_and_b32 s6, s21, 0xff
-; GFX9-NEXT:    s_lshl_b32 s7, s45, 8
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    s_and_b32 s7, s78, 0xff
-; GFX9-NEXT:    s_lshl_b32 s9, s48, 8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    s_and_b32 s7, s21, 0xff
+; GFX9-NEXT:    s_lshl_b32 s9, s59, 8
+; GFX9-NEXT:    s_or_b32 s7, s7, s9
+; GFX9-NEXT:    s_and_b32 s9, s78, 0xff
+; GFX9-NEXT:    s_lshl_b32 s11, s48, 8
+; GFX9-NEXT:    s_or_b32 s9, s9, s11
+; GFX9-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s9, s9, 16
 ; GFX9-NEXT:    s_or_b32 s7, s7, s9
-; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    s_and_b32 s6, s22, 0xff
-; GFX9-NEXT:    s_lshl_b32 s7, s60, 8
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    s_and_b32 s7, s39, 0xff
-; GFX9-NEXT:    s_lshl_b32 s9, s42, 8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    s_and_b32 s7, s22, 0xff
+; GFX9-NEXT:    s_lshl_b32 s9, s60, 8
+; GFX9-NEXT:    s_or_b32 s7, s7, s9
+; GFX9-NEXT:    s_and_b32 s9, s39, 0xff
+; GFX9-NEXT:    s_lshl_b32 s11, s14, 8
+; GFX9-NEXT:    s_or_b32 s9, s9, s11
+; GFX9-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s9, s9, 16
 ; GFX9-NEXT:    s_or_b32 s7, s7, s9
-; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:20
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    s_and_b32 s6, s23, 0xff
-; GFX9-NEXT:    s_lshl_b32 s7, s61, 8
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    s_and_b32 s7, s79, 0xff
-; GFX9-NEXT:    s_lshl_b32 s9, s38, 8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    s_and_b32 s7, s23, 0xff
+; GFX9-NEXT:    s_lshl_b32 s9, s61, 8
+; GFX9-NEXT:    s_or_b32 s7, s7, s9
+; GFX9-NEXT:    s_and_b32 s9, s79, 0xff
+; GFX9-NEXT:    s_lshl_b32 s11, s38, 8
+; GFX9-NEXT:    s_or_b32 s9, s9, s11
+; GFX9-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s9, s9, 16
 ; GFX9-NEXT:    s_or_b32 s7, s7, s9
-; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:24
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    s_and_b32 s6, s24, 0xff
-; GFX9-NEXT:    s_lshl_b32 s7, s62, 8
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    s_and_b32 s7, s37, 0xff
-; GFX9-NEXT:    s_lshl_b32 s9, s40, 8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    s_and_b32 s7, s24, 0xff
+; GFX9-NEXT:    s_lshl_b32 s9, s62, 8
+; GFX9-NEXT:    s_or_b32 s7, s7, s9
+; GFX9-NEXT:    s_and_b32 s9, s37, 0xff
+; GFX9-NEXT:    s_lshl_b32 s11, s12, 8
+; GFX9-NEXT:    s_or_b32 s9, s9, s11
+; GFX9-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s9, s9, 16
 ; GFX9-NEXT:    s_or_b32 s7, s7, s9
-; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    s_and_b32 s6, s25, 0xff
-; GFX9-NEXT:    s_lshl_b32 s7, s63, 8
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    s_and_b32 s7, s88, 0xff
-; GFX9-NEXT:    s_lshl_b32 s9, s36, 8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    s_and_b32 s7, s25, 0xff
+; GFX9-NEXT:    s_lshl_b32 s9, s63, 8
+; GFX9-NEXT:    s_or_b32 s7, s7, s9
+; GFX9-NEXT:    s_and_b32 s9, s88, 0xff
+; GFX9-NEXT:    s_lshl_b32 s11, s36, 8
+; GFX9-NEXT:    s_or_b32 s9, s9, s11
+; GFX9-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s9, s9, 16
 ; GFX9-NEXT:    s_or_b32 s7, s7, s9
-; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    s_and_b32 s6, s26, 0xff
-; GFX9-NEXT:    s_lshl_b32 s7, s72, 8
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    s_and_b32 s7, s35, 0xff
-; GFX9-NEXT:    s_lshl_b32 s9, s14, 8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    s_and_b32 s7, s26, 0xff
+; GFX9-NEXT:    s_lshl_b32 s9, s72, 8
+; GFX9-NEXT:    s_or_b32 s7, s7, s9
+; GFX9-NEXT:    s_and_b32 s9, s35, 0xff
+; GFX9-NEXT:    s_lshl_b32 s10, s10, 8
+; GFX9-NEXT:    s_or_b32 s9, s9, s10
+; GFX9-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s9, s9, 16
 ; GFX9-NEXT:    s_or_b32 s7, s7, s9
-; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:36
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    s_and_b32 s6, s27, 0xff
-; GFX9-NEXT:    s_lshl_b32 s7, s73, 8
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    s_and_b32 s7, s89, 0xff
-; GFX9-NEXT:    s_lshl_b32 s9, s34, 8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    s_and_b32 s7, s27, 0xff
+; GFX9-NEXT:    s_lshl_b32 s9, s73, 8
+; GFX9-NEXT:    s_or_b32 s7, s7, s9
+; GFX9-NEXT:    s_and_b32 s9, s89, 0xff
+; GFX9-NEXT:    s_lshl_b32 s10, s34, 8
+; GFX9-NEXT:    s_or_b32 s9, s9, s10
+; GFX9-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s9, s9, 16
 ; GFX9-NEXT:    s_or_b32 s7, s7, s9
-; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:40
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    s_and_b32 s6, s28, 0xff
-; GFX9-NEXT:    s_lshl_b32 s7, s74, 8
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    s_and_b32 s7, s31, 0xff
-; GFX9-NEXT:    s_lshl_b32 s9, s10, 8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    s_and_b32 s7, s28, 0xff
+; GFX9-NEXT:    s_lshl_b32 s9, s74, 8
 ; GFX9-NEXT:    s_or_b32 s7, s7, s9
-; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
+; GFX9-NEXT:    s_and_b32 s9, s31, 0xff
+; GFX9-NEXT:    s_lshl_b32 s8, s8, 8
+; GFX9-NEXT:    s_or_b32 s8, s9, s8
+; GFX9-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX9-NEXT:    s_or_b32 s7, s7, s8
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    s_and_b32 s6, s29, 0xff
-; GFX9-NEXT:    s_lshl_b32 s7, s75, 8
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
-; GFX9-NEXT:    s_and_b32 s7, s90, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    s_and_b32 s7, s29, 0xff
+; GFX9-NEXT:    s_lshl_b32 s8, s75, 8
+; GFX9-NEXT:    s_or_b32 s7, s7, s8
+; GFX9-NEXT:    s_and_b32 s8, s90, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s9, s30, 8
-; GFX9-NEXT:    s_or_b32 s7, s7, s9
-; GFX9-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
+; GFX9-NEXT:    s_or_b32 s8, s8, s9
+; GFX9-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX9-NEXT:    s_or_b32 s7, s7, s8
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0xff
-; GFX9-NEXT:    s_lshl_b32 s6, s95, 8
-; GFX9-NEXT:    s_or_b32 s4, s4, s6
-; GFX9-NEXT:    s_and_b32 s6, s94, 0xff
-; GFX9-NEXT:    s_lshl_b32 s7, s8, 8
-; GFX9-NEXT:    s_or_b32 s6, s6, s7
+; GFX9-NEXT:    s_lshl_b32 s7, s95, 8
+; GFX9-NEXT:    s_or_b32 s4, s4, s7
+; GFX9-NEXT:    s_and_b32 s7, s94, 0xff
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_or_b32 s6, s7, s6
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX9-NEXT:    s_or_b32 s4, s4, s6
@@ -91821,49 +91874,49 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX9-NEXT:  .LBB109_4:
 ; GFX9-NEXT:    ; implicit-def: $sgpr55
 ; GFX9-NEXT:    ; implicit-def: $sgpr54
-; GFX9-NEXT:    ; implicit-def: $sgpr58
+; GFX9-NEXT:    ; implicit-def: $sgpr44
 ; GFX9-NEXT:    ; implicit-def: $sgpr53
 ; GFX9-NEXT:    ; implicit-def: $sgpr76
 ; GFX9-NEXT:    ; implicit-def: $sgpr52
-; GFX9-NEXT:    ; implicit-def: $sgpr12
-; GFX9-NEXT:    ; implicit-def: $sgpr51
 ; GFX9-NEXT:    ; implicit-def: $sgpr56
-; GFX9-NEXT:    ; implicit-def: $sgpr13
+; GFX9-NEXT:    ; implicit-def: $sgpr51
+; GFX9-NEXT:    ; implicit-def: $sgpr42
+; GFX9-NEXT:    ; implicit-def: $sgpr57
 ; GFX9-NEXT:    ; implicit-def: $sgpr77
 ; GFX9-NEXT:    ; implicit-def: $sgpr50
-; GFX9-NEXT:    ; implicit-def: $sgpr44
+; GFX9-NEXT:    ; implicit-def: $sgpr58
 ; GFX9-NEXT:    ; implicit-def: $sgpr49
-; GFX9-NEXT:    ; implicit-def: $sgpr46
-; GFX9-NEXT:    ; implicit-def: $sgpr45
+; GFX9-NEXT:    ; implicit-def: $sgpr40
+; GFX9-NEXT:    ; implicit-def: $sgpr59
 ; GFX9-NEXT:    ; implicit-def: $sgpr78
 ; GFX9-NEXT:    ; implicit-def: $sgpr48
 ; GFX9-NEXT:    ; implicit-def: $sgpr60
 ; GFX9-NEXT:    ; implicit-def: $sgpr39
-; GFX9-NEXT:    ; implicit-def: $sgpr42
+; GFX9-NEXT:    ; implicit-def: $sgpr14
 ; GFX9-NEXT:    ; implicit-def: $sgpr61
 ; GFX9-NEXT:    ; implicit-def: $sgpr79
 ; GFX9-NEXT:    ; implicit-def: $sgpr38
 ; GFX9-NEXT:    ; implicit-def: $sgpr62
 ; GFX9-NEXT:    ; implicit-def: $sgpr37
-; GFX9-NEXT:    ; implicit-def: $sgpr40
+; GFX9-NEXT:    ; implicit-def: $sgpr12
 ; GFX9-NEXT:    ; implicit-def: $sgpr63
 ; GFX9-NEXT:    ; implicit-def: $sgpr88
 ; GFX9-NEXT:    ; implicit-def: $sgpr36
 ; GFX9-NEXT:    ; implicit-def: $sgpr72
 ; GFX9-NEXT:    ; implicit-def: $sgpr35
-; GFX9-NEXT:    ; implicit-def: $sgpr14
+; GFX9-NEXT:    ; implicit-def: $sgpr10
 ; GFX9-NEXT:    ; implicit-def: $sgpr73
 ; GFX9-NEXT:    ; implicit-def: $sgpr89
 ; GFX9-NEXT:    ; implicit-def: $sgpr34
 ; GFX9-NEXT:    ; implicit-def: $sgpr74
 ; GFX9-NEXT:    ; implicit-def: $sgpr31
-; GFX9-NEXT:    ; implicit-def: $sgpr10
+; GFX9-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-NEXT:    ; implicit-def: $sgpr75
 ; GFX9-NEXT:    ; implicit-def: $sgpr90
 ; GFX9-NEXT:    ; implicit-def: $sgpr30
 ; GFX9-NEXT:    ; implicit-def: $sgpr95
 ; GFX9-NEXT:    ; implicit-def: $sgpr94
-; GFX9-NEXT:    ; implicit-def: $sgpr8
+; GFX9-NEXT:    ; implicit-def: $sgpr6
 ; GFX9-NEXT:    ; implicit-def: $sgpr93
 ; GFX9-NEXT:    ; implicit-def: $sgpr91
 ; GFX9-NEXT:    ; implicit-def: $sgpr92
@@ -91877,7 +91930,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v17, s30, 0
 ; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s28, 0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v17, s31, 1
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v17, s34, 2
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v17, s35, 3
@@ -91887,7 +91940,6 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v17, s39, 7
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v17, s48, 8
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v17, s49, 9
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v17, s50, 10
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB109_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s27, 24
@@ -91929,7 +91981,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s46, s1, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s48, s1, 8
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s49, s0, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s50, s0, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s42, s0, 8
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[8:9], s[26:27], 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[4:5], s[24:25], 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[6:7], s[22:23], 24
@@ -91938,7 +91990,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[14:15], s[16:17], 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[28:29], s[2:3], 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[40:41], s[0:1], 24
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s42
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB109_3
 ; GFX11-TRUE16-NEXT:  .LBB109_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s1, 0xffff0000
@@ -92381,10 +92433,10 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[8:9], vcc, 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s43, vcc_lo, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, vcc_lo, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s50, s42, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s42, s42, 8
 ; GFX11-TRUE16-NEXT:  .LBB109_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s50
+; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s42
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s7, s49
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s40
 ; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xff
@@ -92600,7 +92652,6 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
 ; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
 ; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s50, v17, 10
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s49, v17, 9
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s48, v17, 8
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s39, v17, 7
@@ -92617,7 +92668,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB109_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr42_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr49_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr40_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr48_lo16
@@ -92675,7 +92726,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s4
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v17, s30, 0
 ; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s28, 0
-; GFX11-FAKE16-NEXT:    s_mov_b32 s42, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v17, s31, 1
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v17, s34, 2
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v17, s35, 3
@@ -92684,9 +92735,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v17, s38, 6
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v17, s39, 7
 ; GFX11-FAKE16-NEXT:    v_writelane_b32 v17, s48, 8
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v17, s49, 9
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v17, s50, 10
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v17, s51, 11
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v17, s50, 9
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v17, s51, 10
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB109_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s62, s27, 24
@@ -92728,7 +92778,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s39, s1, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s48, s0, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s49, s0, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s0, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[10:11], s[26:27], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[4:5], s[24:25], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[6:7], s[22:23], 24
@@ -92737,7 +92787,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[14:15], s[16:17], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[28:29], s[2:3], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[40:41], s[0:1], 24
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s42
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB109_3
 ; GFX11-FAKE16-NEXT:  .LBB109_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s1, 0xffff0000
@@ -93180,10 +93230,10 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[10:11], s[50:51], 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s50, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s44, s50, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s49, s42, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s42, 8
 ; GFX11-FAKE16-NEXT:  .LBB109_3: ; %end
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s49, 8
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s42, 8
 ; GFX11-FAKE16-NEXT:    s_and_b32 s7, s48, 0xff
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s40, 8
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s5
@@ -93339,9 +93389,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
 ; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s51, v17, 11
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s50, v17, 10
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s49, v17, 9
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s51, v17, 10
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s50, v17, 9
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s48, v17, 8
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s39, v17, 7
 ; GFX11-FAKE16-NEXT:    v_readlane_b32 s38, v17, 6
@@ -93357,7 +93406,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-FAKE16-NEXT:  .LBB109_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr42
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr48
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr40
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr39
@@ -95935,12 +95984,12 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:72
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:68
 ; SI-NEXT:    v_readfirstlane_b32 s46, v30
-; SI-NEXT:    v_readfirstlane_b32 s41, v23
-; SI-NEXT:    v_readfirstlane_b32 s43, v22
-; SI-NEXT:    v_readfirstlane_b32 s10, v15
-; SI-NEXT:    v_readfirstlane_b32 s12, v14
-; SI-NEXT:    v_readfirstlane_b32 s8, v7
-; SI-NEXT:    v_readfirstlane_b32 s9, v6
+; SI-NEXT:    v_readfirstlane_b32 s44, v23
+; SI-NEXT:    v_readfirstlane_b32 s45, v22
+; SI-NEXT:    v_readfirstlane_b32 s41, v15
+; SI-NEXT:    v_readfirstlane_b32 s43, v14
+; SI-NEXT:    v_readfirstlane_b32 s10, v7
+; SI-NEXT:    v_readfirstlane_b32 s12, v6
 ; SI-NEXT:    v_readfirstlane_b32 s7, v1
 ; SI-NEXT:    v_readfirstlane_b32 s6, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v5
@@ -95973,47 +96022,47 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; SI-NEXT:    s_and_b32 s4, s16, 0xff
 ; SI-NEXT:    s_lshl_b32 s4, s4, 16
 ; SI-NEXT:    s_lshl_b32 s5, s17, 24
-; SI-NEXT:    s_or_b32 s11, s5, s4
+; SI-NEXT:    s_or_b32 s8, s5, s4
 ; SI-NEXT:    s_and_b32 s4, s18, 0xff
 ; SI-NEXT:    s_lshl_b32 s4, s4, 16
 ; SI-NEXT:    s_lshl_b32 s5, s19, 24
-; SI-NEXT:    s_or_b32 s13, s5, s4
+; SI-NEXT:    s_or_b32 s9, s5, s4
 ; SI-NEXT:    s_and_b32 s4, s20, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s21, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_lshl_b32 s14, s4, 16
+; SI-NEXT:    s_lshl_b32 s11, s4, 16
 ; SI-NEXT:    s_and_b32 s4, s22, 0xff
 ; SI-NEXT:    s_lshl_b32 s4, s4, 16
 ; SI-NEXT:    s_lshl_b32 s5, s23, 24
-; SI-NEXT:    s_or_b32 s15, s5, s4
+; SI-NEXT:    s_or_b32 s13, s5, s4
 ; SI-NEXT:    s_and_b32 s4, s24, 0xff
 ; SI-NEXT:    s_lshl_b32 s4, s4, 16
 ; SI-NEXT:    s_lshl_b32 s5, s25, 24
-; SI-NEXT:    s_or_b32 s40, s5, s4
+; SI-NEXT:    s_or_b32 s14, s5, s4
 ; SI-NEXT:    s_and_b32 s4, s26, 0xff
 ; SI-NEXT:    s_lshl_b32 s4, s4, 16
 ; SI-NEXT:    s_lshl_b32 s5, s27, 24
-; SI-NEXT:    s_or_b32 s42, s5, s4
+; SI-NEXT:    s_or_b32 s15, s5, s4
 ; SI-NEXT:    s_and_b32 s4, s28, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s29, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_lshl_b32 s44, s4, 16
+; SI-NEXT:    s_lshl_b32 s40, s4, 16
 ; SI-NEXT:    s_and_b32 s4, s6, 0xff
 ; SI-NEXT:    s_lshl_b32 s4, s4, 16
 ; SI-NEXT:    s_lshl_b32 s5, s7, 24
-; SI-NEXT:    s_or_b32 s45, s5, s4
-; SI-NEXT:    s_and_b32 s4, s9, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s8, 8
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_lshl_b32 s60, s4, 16
+; SI-NEXT:    s_or_b32 s42, s5, s4
 ; SI-NEXT:    s_and_b32 s4, s12, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s10, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    s_lshl_b32 s60, s4, 16
+; SI-NEXT:    s_and_b32 s4, s43, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s41, 8
+; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_and_b32_e32 v9, 0xff, v2
 ; SI-NEXT:    s_lshl_b32 s61, s4, 16
 ; SI-NEXT:    v_and_b32_e32 v17, 0xff, v18
-; SI-NEXT:    s_and_b32 s4, s43, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s41, 8
+; SI-NEXT:    s_and_b32 s4, s45, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s44, 8
 ; SI-NEXT:    v_and_b32_e32 v25, 0xff, v52
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 24, v3
@@ -96161,10 +96210,10 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
 ; SI-NEXT:    v_or_b32_e32 v13, v22, v13
 ; SI-NEXT:    v_or_b32_e32 v9, v13, v9
-; SI-NEXT:    s_add_i32 s43, s43, 3
+; SI-NEXT:    s_add_i32 s45, s45, 3
 ; SI-NEXT:    v_add_i32_e32 v22, vcc, 0x3000000, v9
-; SI-NEXT:    s_and_b32 s4, s43, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s41, 8
+; SI-NEXT:    s_and_b32 s4, s45, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s44, 8
 ; SI-NEXT:    v_add_i32_e32 v9, vcc, 3, v24
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
@@ -96184,10 +96233,10 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
 ; SI-NEXT:    v_or_b32_e32 v7, v7, v13
-; SI-NEXT:    s_add_i32 s12, s12, 3
+; SI-NEXT:    s_add_i32 s43, s43, 3
 ; SI-NEXT:    v_or_b32_e32 v7, v7, v9
-; SI-NEXT:    s_and_b32 s4, s12, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s10, 8
+; SI-NEXT:    s_and_b32 s4, s43, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s41, 8
 ; SI-NEXT:    v_add_i32_e32 v9, vcc, 3, v16
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
@@ -96195,10 +96244,10 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v6, v6, v9
-; SI-NEXT:    s_add_i32 s9, s9, 3
+; SI-NEXT:    s_add_i32 s12, s12, 3
 ; SI-NEXT:    v_or_b32_e32 v6, s4, v6
-; SI-NEXT:    s_and_b32 s4, s9, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s8, 8
+; SI-NEXT:    s_and_b32 s4, s12, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s10, 8
 ; SI-NEXT:    v_add_i32_e32 v8, vcc, 3, v8
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
@@ -96286,14 +96335,14 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; SI-NEXT:    s_add_i32 s5, s5, 0x3000000
 ; SI-NEXT:    s_add_i32 s6, s6, 0x3000000
 ; SI-NEXT:    s_add_i32 s7, s7, 0x3000000
-; SI-NEXT:    s_and_b32 s13, s7, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s11, s7, 16
-; SI-NEXT:    s_and_b32 s15, s6, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s14, s6, 16
-; SI-NEXT:    s_and_b32 s42, s5, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s40, s5, 16
-; SI-NEXT:    s_and_b32 s45, s4, 0xffff0000
-; SI-NEXT:    s_lshl_b32 s44, s4, 16
+; SI-NEXT:    s_and_b32 s9, s7, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s8, s7, 16
+; SI-NEXT:    s_and_b32 s13, s6, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s11, s6, 16
+; SI-NEXT:    s_and_b32 s15, s5, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s14, s5, 16
+; SI-NEXT:    s_and_b32 s42, s4, 0xffff0000
+; SI-NEXT:    s_lshl_b32 s40, s4, 16
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v0
 ; SI-NEXT:    v_and_b32_e32 v38, 0xffff0000, v1
@@ -96320,14 +96369,14 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v43
 ; SI-NEXT:    s_branch .LBB111_5
 ; SI-NEXT:  .LBB111_3:
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr9
 ; SI-NEXT:    ; implicit-def: $sgpr11
 ; SI-NEXT:    ; implicit-def: $sgpr13
 ; SI-NEXT:    ; implicit-def: $sgpr14
 ; SI-NEXT:    ; implicit-def: $sgpr15
 ; SI-NEXT:    ; implicit-def: $sgpr40
 ; SI-NEXT:    ; implicit-def: $sgpr42
-; SI-NEXT:    ; implicit-def: $sgpr44
-; SI-NEXT:    ; implicit-def: $sgpr45
 ; SI-NEXT:    ; implicit-def: $vgpr37
 ; SI-NEXT:    ; implicit-def: $vgpr9
 ; SI-NEXT:    ; implicit-def: $sgpr60
@@ -96367,14 +96416,14 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v0, s11
-; SI-NEXT:    v_mov_b32_e32 v1, s13
-; SI-NEXT:    v_mov_b32_e32 v2, s14
-; SI-NEXT:    v_mov_b32_e32 v3, s15
-; SI-NEXT:    v_mov_b32_e32 v4, s40
-; SI-NEXT:    v_mov_b32_e32 v5, s42
-; SI-NEXT:    v_mov_b32_e32 v6, s44
-; SI-NEXT:    v_mov_b32_e32 v7, s45
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    v_mov_b32_e32 v3, s13
+; SI-NEXT:    v_mov_b32_e32 v4, s14
+; SI-NEXT:    v_mov_b32_e32 v5, s15
+; SI-NEXT:    v_mov_b32_e32 v6, s40
+; SI-NEXT:    v_mov_b32_e32 v7, s42
 ; SI-NEXT:    v_mov_b32_e32 v8, v37
 ; SI-NEXT:    v_mov_b32_e32 v11, v38
 ; SI-NEXT:    v_mov_b32_e32 v12, v48
@@ -96466,11 +96515,11 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; VI-NEXT:    s_waitcnt vmcnt(11)
 ; VI-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
 ; VI-NEXT:    s_waitcnt vmcnt(9)
-; VI-NEXT:    v_lshlrev_b32_e32 v22, 8, v39
+; VI-NEXT:    v_lshlrev_b32_e32 v38, 8, v39
 ; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_lshlrev_b32_e32 v51, 8, v48
 ; VI-NEXT:    s_waitcnt vmcnt(5)
-; VI-NEXT:    v_lshlrev_b32_e32 v38, 8, v49
+; VI-NEXT:    v_lshlrev_b32_e32 v22, 8, v49
 ; VI-NEXT:    s_cbranch_scc0 .LBB111_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    v_or_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -96529,10 +96578,10 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; VI-NEXT:    v_or_b32_sdwa v1, v43, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v45, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v45, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v47, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v57, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v32, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v3, s4, v0
@@ -96592,9 +96641,9 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v24
 ; VI-NEXT:    v_or_b32_sdwa v9, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v23
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v45
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v57
 ; VI-NEXT:    v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_or_b32_sdwa v2, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v22, vcc, 0x300, v3
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v55
 ; VI-NEXT:    v_or_b32_sdwa v8, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -96651,8 +96700,8 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; VI-NEXT:    s_addk_i32 s6, 0x300
 ; VI-NEXT:    s_addk_i32 s8, 0x300
 ; VI-NEXT:    s_addk_i32 s10, 0x300
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v57
 ; VI-NEXT:    v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v45
 ; VI-NEXT:    s_addk_i32 s4, 0x300
 ; VI-NEXT:    s_lshl_b32 s5, s5, 16
 ; VI-NEXT:    s_lshl_b32 s7, s7, 16
@@ -96660,8 +96709,8 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; VI-NEXT:    s_and_b32 s10, s10, 0xffff
 ; VI-NEXT:    s_and_b32 s8, s8, 0xffff
 ; VI-NEXT:    s_and_b32 s6, s6, 0xffff
-; VI-NEXT:    v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 0x300, v1
+; VI-NEXT:    v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    s_or_b32 s9, s9, s10
 ; VI-NEXT:    s_or_b32 s7, s7, s8
 ; VI-NEXT:    s_or_b32 s5, s5, s6

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
index 70a4fdce8d9a0..3a26a5c263d78 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
@@ -3264,7 +3264,7 @@ define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i3
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s43, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s46, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB13_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -3283,10 +3283,10 @@ define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s40, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s41, s16, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s42, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s43
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s43, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s46
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB13_3
 ; GFX11-TRUE16-NEXT:  .LBB13_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_i32 s29, s29, 3
@@ -3322,9 +3322,9 @@ define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s40, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s41, s16, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s42, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s46, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s43, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB13_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s15, s18, s15
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s14, s19, s14
@@ -3338,18 +3338,16 @@ define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s10, s19, s10
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s18, s24
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s19, s25
-; GFX11-TRUE16-NEXT:    s_mov_b32 s43, s46
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s9, s18, s9
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s8, s19, s8
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s18, s26
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s19, s27
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s43
-; GFX11-TRUE16-NEXT:    s_mov_b32 s43, s44
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s7, s18, s7
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s6, s19, s6
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s18, s28
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s19, s29
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s45
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s45
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s44
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s43
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s42
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s41
@@ -3367,9 +3365,9 @@ define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v17, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB13_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr46_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr45_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr44_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr43_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr42_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr41_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr40_lo16
@@ -6341,7 +6339,7 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s43, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s46, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB17_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -6360,10 +6358,10 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s40, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s41, s16, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s42, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s43
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s43, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s46
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB17_3
 ; GFX11-TRUE16-NEXT:  .LBB17_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_i32 s29, s29, 3
@@ -6399,9 +6397,9 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s40, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s41, s16, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s42, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s46, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s43, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB17_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s15, s18, s15
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s14, s19, s14
@@ -6415,18 +6413,16 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s10, s19, s10
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s18, s24
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s19, s25
-; GFX11-TRUE16-NEXT:    s_mov_b32 s43, s46
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s9, s18, s9
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s8, s19, s8
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s18, s26
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s19, s27
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s43
-; GFX11-TRUE16-NEXT:    s_mov_b32 s43, s44
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s7, s18, s7
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s6, s19, s6
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s18, s28
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s19, s29
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s45
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s45
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s44
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s43
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s42
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s41
@@ -6444,9 +6440,9 @@ define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v17, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB17_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr46_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr45_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr44_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr43_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr42_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr41_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr40_lo16
@@ -17774,7 +17770,7 @@ define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s43, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s46, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB41_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -17793,10 +17789,10 @@ define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s40, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s41, s16, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s42, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s43
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s43, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s46
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB41_3
 ; GFX11-TRUE16-NEXT:  .LBB41_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_u32 s28, s28, 3
@@ -17832,9 +17828,9 @@ define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s40, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s41, s16, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s42, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s46, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s43, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB41_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s15, s18, s15
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s14, s19, s14
@@ -17848,18 +17844,16 @@ define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s10, s19, s10
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s18, s24
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s19, s25
-; GFX11-TRUE16-NEXT:    s_mov_b32 s43, s46
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s9, s18, s9
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s8, s19, s8
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s18, s26
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s19, s27
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s43
-; GFX11-TRUE16-NEXT:    s_mov_b32 s43, s44
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s7, s18, s7
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s6, s19, s6
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s18, s28
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s19, s29
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s45
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s45
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s44
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s43
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s42
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s41
@@ -17877,9 +17871,9 @@ define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v17, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB41_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr46_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr45_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr44_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr43_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr42_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr41_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr40_lo16
@@ -20861,7 +20855,7 @@ define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s43, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s46, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB45_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -20880,10 +20874,10 @@ define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s40, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s41, s16, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s42, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s43
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s43, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s46
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB45_3
 ; GFX11-TRUE16-NEXT:  .LBB45_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_u32 s28, s28, 3
@@ -20919,9 +20913,9 @@ define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s40, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s41, s16, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s42, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s46, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s43, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB45_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s15, s18, s15
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s14, s19, s14
@@ -20935,18 +20929,16 @@ define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s10, s19, s10
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s18, s24
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s19, s25
-; GFX11-TRUE16-NEXT:    s_mov_b32 s43, s46
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s9, s18, s9
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s8, s19, s8
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s18, s26
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s19, s27
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s43
-; GFX11-TRUE16-NEXT:    s_mov_b32 s43, s44
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s7, s18, s7
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s6, s19, s6
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s18, s28
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s19, s29
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s45
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s45
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s44
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s43
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s42
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s41
@@ -20964,9 +20956,9 @@ define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v17, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB45_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr46_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr45_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr44_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr43_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr42_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr41_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr40_lo16

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
index 9d2b0df4280e7..cc55ba1d84df6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
@@ -3400,7 +3400,7 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v0
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX11-TRUE16-NEXT:    s_mov_b32 s47, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s58, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s6, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB13_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -3421,10 +3421,10 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s16, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s56, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s58, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s47
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s47, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s56, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s58
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB13_3
 ; GFX11-TRUE16-NEXT:  .LBB13_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_i32 s4, s4, 3
@@ -3464,26 +3464,24 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s16, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s56, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s58, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s47, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s56, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB13_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s15, s22, s15
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s14, s23, s14
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s22, s24
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s23, s25
-; GFX11-TRUE16-NEXT:    s_mov_b32 s47, s58
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s13, s22, s13
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s12, s23, s12
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s22, s26
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s23, s27
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s47
-; GFX11-TRUE16-NEXT:    s_mov_b32 s47, s56
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s11, s22, s11
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s10, s23, s10
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s22, s28
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s23, s29
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s57
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s57
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s56
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s47
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s46
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s45
@@ -3508,9 +3506,9 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB13_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr56_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr47_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr46_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr45_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr44_lo16
@@ -6776,7 +6774,7 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v0
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX11-TRUE16-NEXT:    s_mov_b32 s47, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s58, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s6, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB17_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -6797,10 +6795,10 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s16, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s56, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s58, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s47
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s47, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s56, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s58
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB17_3
 ; GFX11-TRUE16-NEXT:  .LBB17_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_i32 s4, s4, 3
@@ -6840,26 +6838,24 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s16, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s56, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s58, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s47, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s56, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB17_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s15, s22, s15
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s14, s23, s14
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s22, s24
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s23, s25
-; GFX11-TRUE16-NEXT:    s_mov_b32 s47, s58
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s13, s22, s13
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s12, s23, s12
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s22, s26
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s23, s27
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s47
-; GFX11-TRUE16-NEXT:    s_mov_b32 s47, s56
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s11, s22, s11
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s10, s23, s10
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s22, s28
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s23, s29
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s57
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s57
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s56
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s47
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s46
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s45
@@ -6884,9 +6880,9 @@ define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB17_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr56_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr47_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr46_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr45_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr44_lo16
@@ -8029,7 +8025,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v62, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v44, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v43, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v56, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v41, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v40, v9
 ; SI-NEXT:    v_cvt_f16_f32_e32 v55, v8
@@ -8049,26 +8045,26 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v20, v25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v21, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v61, s17
-; SI-NEXT:    v_cvt_f16_f32_e32 v57, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v60, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v58, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v58, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, s16
+; SI-NEXT:    v_cvt_f16_f32_e32 v59, s19
+; SI-NEXT:    v_cvt_f16_f32_e32 v57, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v39, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v42, s20
+; SI-NEXT:    v_cvt_f16_f32_e32 v56, s20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v38, s23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v37, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v36, s25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v35, s24
 ; SI-NEXT:    v_cvt_f16_f32_e32 v34, s27
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, s26
+; SI-NEXT:    v_cvt_f16_f32_e32 v61, s26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v25, s29
 ; SI-NEXT:    v_cvt_f16_f32_e32 v24, s28
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v26
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB19_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v61
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v60
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v58
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v59
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v39
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v38
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v36
@@ -8077,7 +8073,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v32
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v63
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v44
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v56
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v42
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v40
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v54
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v52
@@ -8087,12 +8083,12 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
 ; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v28
 ; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v23
 ; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; SI-NEXT:    v_or_b32_e32 v0, v57, v0
-; SI-NEXT:    v_or_b32_e32 v1, v58, v1
-; SI-NEXT:    v_or_b32_e32 v2, v42, v2
+; SI-NEXT:    v_or_b32_e32 v0, v60, v0
+; SI-NEXT:    v_or_b32_e32 v1, v57, v1
+; SI-NEXT:    v_or_b32_e32 v2, v56, v2
 ; SI-NEXT:    v_or_b32_e32 v3, v37, v3
 ; SI-NEXT:    v_or_b32_e32 v4, v35, v4
-; SI-NEXT:    v_or_b32_e32 v5, v33, v5
+; SI-NEXT:    v_or_b32_e32 v5, v61, v5
 ; SI-NEXT:    v_or_b32_e32 v6, v24, v6
 ; SI-NEXT:    v_or_b32_e32 v7, v47, v7
 ; SI-NEXT:    v_or_b32_e32 v8, v62, v8
@@ -8109,12 +8105,11 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
 ; SI-NEXT:    v_or_b32_e32 v19, v21, v19
 ; SI-NEXT:    s_cbranch_execnz .LBB19_3
 ; SI-NEXT:  .LBB19_2: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v61
-; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v60
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v57
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v58
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v58
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v59
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v60
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v57
 ; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
@@ -8128,8 +8123,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v39
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v42
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v56
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v37
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, v35
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
@@ -8143,7 +8137,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v38
 ; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v33
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v61
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v24
@@ -8170,7 +8164,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v56
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v42
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v25
@@ -8294,20 +8288,20 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB19_4:
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v42, v48
+; SI-NEXT:    v_mov_b32_e32 v59, v48
 ; SI-NEXT:    v_mov_b32_e32 v48, v21
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v58, v49
+; SI-NEXT:    v_mov_b32_e32 v60, v49
 ; SI-NEXT:    v_mov_b32_e32 v49, v20
 ; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v43, v50
 ; SI-NEXT:    v_mov_b32_e32 v50, v22
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v44, v51
 ; SI-NEXT:    v_mov_b32_e32 v51, v23
@@ -8315,21 +8309,22 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v52, v27
 ; SI-NEXT:    v_mov_b32_e32 v46, v53
 ; SI-NEXT:    v_mov_b32_e32 v53, v28
-; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v60, v33
+; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; SI-NEXT:    v_mov_b32_e32 v41, v32
 ; SI-NEXT:    v_mov_b32_e32 v33, v47
 ; SI-NEXT:    v_mov_b32_e32 v47, v54
 ; SI-NEXT:    v_mov_b32_e32 v54, v29
+; SI-NEXT:    v_mov_b32_e32 v42, v56
 ; SI-NEXT:    v_mov_b32_e32 v56, v55
 ; SI-NEXT:    v_mov_b32_e32 v55, v30
-; SI-NEXT:    v_mov_b32_e32 v59, v57
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v36, v57
 ; SI-NEXT:    v_mov_b32_e32 v57, v40
 ; SI-NEXT:    v_mov_b32_e32 v40, v31
-; SI-NEXT:    v_mov_b32_e32 v36, v39
-; SI-NEXT:    v_mov_b32_e32 v39, v37
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v39, v58
+; SI-NEXT:    v_mov_b32_e32 v58, v37
 ; SI-NEXT:    v_mov_b32_e32 v37, v34
 ; SI-NEXT:    v_mov_b32_e32 v34, v24
 ; SI-NEXT:    v_mov_b32_e32 v32, v38
@@ -8341,34 +8336,34 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v38, v32
 ; SI-NEXT:    v_mov_b32_e32 v24, v34
 ; SI-NEXT:    v_mov_b32_e32 v34, v37
-; SI-NEXT:    v_mov_b32_e32 v37, v39
-; SI-NEXT:    v_mov_b32_e32 v39, v36
+; SI-NEXT:    v_mov_b32_e32 v37, v58
+; SI-NEXT:    v_mov_b32_e32 v58, v39
+; SI-NEXT:    v_mov_b32_e32 v31, v40
+; SI-NEXT:    v_mov_b32_e32 v40, v57
+; SI-NEXT:    v_mov_b32_e32 v57, v36
 ; SI-NEXT:    v_mov_b32_e32 v30, v55
 ; SI-NEXT:    v_mov_b32_e32 v55, v56
-; SI-NEXT:    v_mov_b32_e32 v29, v54
-; SI-NEXT:    v_mov_b32_e32 v54, v47
-; SI-NEXT:    v_mov_b32_e32 v47, v33
+; SI-NEXT:    v_mov_b32_e32 v56, v42
 ; SI-NEXT:    v_mov_b32_e32 v32, v41
-; SI-NEXT:    v_mov_b32_e32 v33, v60
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; SI-NEXT:    v_mov_b32_e32 v23, v51
 ; SI-NEXT:    v_mov_b32_e32 v51, v44
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; SI-NEXT:    v_mov_b32_e32 v22, v50
 ; SI-NEXT:    v_mov_b32_e32 v50, v43
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT:    v_mov_b32_e32 v20, v49
-; SI-NEXT:    v_mov_b32_e32 v49, v58
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v49, v60
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; SI-NEXT:    v_mov_b32_e32 v21, v48
-; SI-NEXT:    v_mov_b32_e32 v48, v42
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v48, v59
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v31, v40
-; SI-NEXT:    v_mov_b32_e32 v40, v57
-; SI-NEXT:    v_mov_b32_e32 v57, v59
+; SI-NEXT:    v_mov_b32_e32 v29, v54
+; SI-NEXT:    v_mov_b32_e32 v54, v47
+; SI-NEXT:    v_mov_b32_e32 v47, v33
 ; SI-NEXT:    v_mov_b32_e32 v28, v53
 ; SI-NEXT:    v_mov_b32_e32 v53, v46
 ; SI-NEXT:    v_mov_b32_e32 v27, v52
@@ -16117,7 +16112,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
 ; SI-NEXT:    v_cvt_f16_f32_e32 v62, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v44, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v43, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v56, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v41, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v40, v9
 ; SI-NEXT:    v_cvt_f16_f32_e32 v55, v8
@@ -16137,26 +16132,26 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
 ; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v20, v25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v21, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v61, s17
-; SI-NEXT:    v_cvt_f16_f32_e32 v57, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v60, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v58, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v58, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, s16
+; SI-NEXT:    v_cvt_f16_f32_e32 v59, s19
+; SI-NEXT:    v_cvt_f16_f32_e32 v57, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v39, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v42, s20
+; SI-NEXT:    v_cvt_f16_f32_e32 v56, s20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v38, s23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v37, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v36, s25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v35, s24
 ; SI-NEXT:    v_cvt_f16_f32_e32 v34, s27
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, s26
+; SI-NEXT:    v_cvt_f16_f32_e32 v61, s26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v25, s29
 ; SI-NEXT:    v_cvt_f16_f32_e32 v24, s28
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v26
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB35_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v61
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v60
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v58
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v59
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v39
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v38
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v36
@@ -16165,7 +16160,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v32
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v63
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v44
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v56
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v42
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v40
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v54
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v52
@@ -16175,12 +16170,12 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
 ; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v28
 ; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v23
 ; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; SI-NEXT:    v_or_b32_e32 v0, v57, v0
-; SI-NEXT:    v_or_b32_e32 v1, v58, v1
-; SI-NEXT:    v_or_b32_e32 v2, v42, v2
+; SI-NEXT:    v_or_b32_e32 v0, v60, v0
+; SI-NEXT:    v_or_b32_e32 v1, v57, v1
+; SI-NEXT:    v_or_b32_e32 v2, v56, v2
 ; SI-NEXT:    v_or_b32_e32 v3, v37, v3
 ; SI-NEXT:    v_or_b32_e32 v4, v35, v4
-; SI-NEXT:    v_or_b32_e32 v5, v33, v5
+; SI-NEXT:    v_or_b32_e32 v5, v61, v5
 ; SI-NEXT:    v_or_b32_e32 v6, v24, v6
 ; SI-NEXT:    v_or_b32_e32 v7, v47, v7
 ; SI-NEXT:    v_or_b32_e32 v8, v62, v8
@@ -16197,12 +16192,11 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v19, v21, v19
 ; SI-NEXT:    s_cbranch_execnz .LBB35_3
 ; SI-NEXT:  .LBB35_2: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v61
-; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v60
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v57
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v58
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v58
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v59
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v60
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v57
 ; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
@@ -16216,8 +16210,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v39
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v42
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v56
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v37
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, v35
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
@@ -16231,7 +16224,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v38
 ; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v33
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v61
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v24
@@ -16258,7 +16251,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v56
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v42
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v25
@@ -16382,20 +16375,20 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB35_4:
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v42, v48
+; SI-NEXT:    v_mov_b32_e32 v59, v48
 ; SI-NEXT:    v_mov_b32_e32 v48, v21
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v58, v49
+; SI-NEXT:    v_mov_b32_e32 v60, v49
 ; SI-NEXT:    v_mov_b32_e32 v49, v20
 ; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v43, v50
 ; SI-NEXT:    v_mov_b32_e32 v50, v22
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v44, v51
 ; SI-NEXT:    v_mov_b32_e32 v51, v23
@@ -16403,21 +16396,22 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v52, v27
 ; SI-NEXT:    v_mov_b32_e32 v46, v53
 ; SI-NEXT:    v_mov_b32_e32 v53, v28
-; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v60, v33
+; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; SI-NEXT:    v_mov_b32_e32 v41, v32
 ; SI-NEXT:    v_mov_b32_e32 v33, v47
 ; SI-NEXT:    v_mov_b32_e32 v47, v54
 ; SI-NEXT:    v_mov_b32_e32 v54, v29
+; SI-NEXT:    v_mov_b32_e32 v42, v56
 ; SI-NEXT:    v_mov_b32_e32 v56, v55
 ; SI-NEXT:    v_mov_b32_e32 v55, v30
-; SI-NEXT:    v_mov_b32_e32 v59, v57
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v36, v57
 ; SI-NEXT:    v_mov_b32_e32 v57, v40
 ; SI-NEXT:    v_mov_b32_e32 v40, v31
-; SI-NEXT:    v_mov_b32_e32 v36, v39
-; SI-NEXT:    v_mov_b32_e32 v39, v37
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v39, v58
+; SI-NEXT:    v_mov_b32_e32 v58, v37
 ; SI-NEXT:    v_mov_b32_e32 v37, v34
 ; SI-NEXT:    v_mov_b32_e32 v34, v24
 ; SI-NEXT:    v_mov_b32_e32 v32, v38
@@ -16429,34 +16423,34 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v38, v32
 ; SI-NEXT:    v_mov_b32_e32 v24, v34
 ; SI-NEXT:    v_mov_b32_e32 v34, v37
-; SI-NEXT:    v_mov_b32_e32 v37, v39
-; SI-NEXT:    v_mov_b32_e32 v39, v36
+; SI-NEXT:    v_mov_b32_e32 v37, v58
+; SI-NEXT:    v_mov_b32_e32 v58, v39
+; SI-NEXT:    v_mov_b32_e32 v31, v40
+; SI-NEXT:    v_mov_b32_e32 v40, v57
+; SI-NEXT:    v_mov_b32_e32 v57, v36
 ; SI-NEXT:    v_mov_b32_e32 v30, v55
 ; SI-NEXT:    v_mov_b32_e32 v55, v56
-; SI-NEXT:    v_mov_b32_e32 v29, v54
-; SI-NEXT:    v_mov_b32_e32 v54, v47
-; SI-NEXT:    v_mov_b32_e32 v47, v33
+; SI-NEXT:    v_mov_b32_e32 v56, v42
 ; SI-NEXT:    v_mov_b32_e32 v32, v41
-; SI-NEXT:    v_mov_b32_e32 v33, v60
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; SI-NEXT:    v_mov_b32_e32 v23, v51
 ; SI-NEXT:    v_mov_b32_e32 v51, v44
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; SI-NEXT:    v_mov_b32_e32 v22, v50
 ; SI-NEXT:    v_mov_b32_e32 v50, v43
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT:    v_mov_b32_e32 v20, v49
-; SI-NEXT:    v_mov_b32_e32 v49, v58
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v49, v60
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; SI-NEXT:    v_mov_b32_e32 v21, v48
-; SI-NEXT:    v_mov_b32_e32 v48, v42
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v48, v59
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v31, v40
-; SI-NEXT:    v_mov_b32_e32 v40, v57
-; SI-NEXT:    v_mov_b32_e32 v57, v59
+; SI-NEXT:    v_mov_b32_e32 v29, v54
+; SI-NEXT:    v_mov_b32_e32 v54, v47
+; SI-NEXT:    v_mov_b32_e32 v47, v33
 ; SI-NEXT:    v_mov_b32_e32 v28, v53
 ; SI-NEXT:    v_mov_b32_e32 v53, v46
 ; SI-NEXT:    v_mov_b32_e32 v27, v52
@@ -18916,7 +18910,7 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v0
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX11-TRUE16-NEXT:    s_mov_b32 s47, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s58, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s6, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB41_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -18937,10 +18931,10 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s16, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s56, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s58, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s47
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s47, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s56, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s58
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB41_3
 ; GFX11-TRUE16-NEXT:  .LBB41_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_u32 s5, s5, 3
@@ -18980,26 +18974,24 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s16, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s56, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s58, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s47, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s56, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB41_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s15, s22, s15
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s14, s23, s14
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s22, s24
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s23, s25
-; GFX11-TRUE16-NEXT:    s_mov_b32 s47, s58
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s13, s22, s13
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s12, s23, s12
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s22, s26
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s23, s27
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s47
-; GFX11-TRUE16-NEXT:    s_mov_b32 s47, s56
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s11, s22, s11
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s10, s23, s10
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s22, s28
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s23, s29
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s57
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s57
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s56
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s47
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s46
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s45
@@ -19024,9 +19016,9 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB41_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr56_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr47_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr46_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr45_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr44_lo16
@@ -22302,7 +22294,7 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v0
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX11-TRUE16-NEXT:    s_mov_b32 s47, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s58, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s6, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB45_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -22323,10 +22315,10 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s16, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s56, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s58, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s47
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s47, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s56, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s58
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB45_3
 ; GFX11-TRUE16-NEXT:  .LBB45_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_u32 s5, s5, 3
@@ -22366,26 +22358,24 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s44, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s45, s16, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s56, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s58, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s47, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s56, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB45_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s15, s22, s15
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s14, s23, s14
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s22, s24
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s23, s25
-; GFX11-TRUE16-NEXT:    s_mov_b32 s47, s58
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s13, s22, s13
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s12, s23, s12
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s22, s26
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s23, s27
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s47
-; GFX11-TRUE16-NEXT:    s_mov_b32 s47, s56
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s11, s22, s11
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s10, s23, s10
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s22, s28
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s23, s29
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s57
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s57
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s56
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s47
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s46
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s45
@@ -22410,9 +22400,9 @@ define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB45_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr56_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr47_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr46_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr45_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr44_lo16
@@ -23555,7 +23545,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v62, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v44, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v43, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v56, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v41, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v40, v9
 ; SI-NEXT:    v_cvt_f16_f32_e32 v55, v8
@@ -23575,26 +23565,26 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v20, v25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v21, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v61, s17
-; SI-NEXT:    v_cvt_f16_f32_e32 v57, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v60, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v58, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v58, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, s16
+; SI-NEXT:    v_cvt_f16_f32_e32 v59, s19
+; SI-NEXT:    v_cvt_f16_f32_e32 v57, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v39, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v42, s20
+; SI-NEXT:    v_cvt_f16_f32_e32 v56, s20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v38, s23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v37, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v36, s25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v35, s24
 ; SI-NEXT:    v_cvt_f16_f32_e32 v34, s27
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, s26
+; SI-NEXT:    v_cvt_f16_f32_e32 v61, s26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v25, s29
 ; SI-NEXT:    v_cvt_f16_f32_e32 v24, s28
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v26
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB47_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v61
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v60
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v58
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v59
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v39
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v38
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v36
@@ -23603,7 +23593,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v32
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v63
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v44
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v56
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v42
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v40
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v54
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v52
@@ -23613,12 +23603,12 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
 ; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v28
 ; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v23
 ; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; SI-NEXT:    v_or_b32_e32 v0, v57, v0
-; SI-NEXT:    v_or_b32_e32 v1, v58, v1
-; SI-NEXT:    v_or_b32_e32 v2, v42, v2
+; SI-NEXT:    v_or_b32_e32 v0, v60, v0
+; SI-NEXT:    v_or_b32_e32 v1, v57, v1
+; SI-NEXT:    v_or_b32_e32 v2, v56, v2
 ; SI-NEXT:    v_or_b32_e32 v3, v37, v3
 ; SI-NEXT:    v_or_b32_e32 v4, v35, v4
-; SI-NEXT:    v_or_b32_e32 v5, v33, v5
+; SI-NEXT:    v_or_b32_e32 v5, v61, v5
 ; SI-NEXT:    v_or_b32_e32 v6, v24, v6
 ; SI-NEXT:    v_or_b32_e32 v7, v47, v7
 ; SI-NEXT:    v_or_b32_e32 v8, v62, v8
@@ -23635,12 +23625,11 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
 ; SI-NEXT:    v_or_b32_e32 v19, v21, v19
 ; SI-NEXT:    s_cbranch_execnz .LBB47_3
 ; SI-NEXT:  .LBB47_2: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v61
-; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v60
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v57
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v58
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v58
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v59
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v60
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v57
 ; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
@@ -23654,8 +23643,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v39
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v42
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v56
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v37
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, v35
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
@@ -23669,7 +23657,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v38
 ; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v33
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v61
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v24
@@ -23696,7 +23684,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v56
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v42
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v25
@@ -23820,20 +23808,20 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB47_4:
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v42, v48
+; SI-NEXT:    v_mov_b32_e32 v59, v48
 ; SI-NEXT:    v_mov_b32_e32 v48, v21
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v58, v49
+; SI-NEXT:    v_mov_b32_e32 v60, v49
 ; SI-NEXT:    v_mov_b32_e32 v49, v20
 ; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v43, v50
 ; SI-NEXT:    v_mov_b32_e32 v50, v22
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v44, v51
 ; SI-NEXT:    v_mov_b32_e32 v51, v23
@@ -23841,21 +23829,22 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v52, v27
 ; SI-NEXT:    v_mov_b32_e32 v46, v53
 ; SI-NEXT:    v_mov_b32_e32 v53, v28
-; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v60, v33
+; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; SI-NEXT:    v_mov_b32_e32 v41, v32
 ; SI-NEXT:    v_mov_b32_e32 v33, v47
 ; SI-NEXT:    v_mov_b32_e32 v47, v54
 ; SI-NEXT:    v_mov_b32_e32 v54, v29
+; SI-NEXT:    v_mov_b32_e32 v42, v56
 ; SI-NEXT:    v_mov_b32_e32 v56, v55
 ; SI-NEXT:    v_mov_b32_e32 v55, v30
-; SI-NEXT:    v_mov_b32_e32 v59, v57
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v36, v57
 ; SI-NEXT:    v_mov_b32_e32 v57, v40
 ; SI-NEXT:    v_mov_b32_e32 v40, v31
-; SI-NEXT:    v_mov_b32_e32 v36, v39
-; SI-NEXT:    v_mov_b32_e32 v39, v37
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v39, v58
+; SI-NEXT:    v_mov_b32_e32 v58, v37
 ; SI-NEXT:    v_mov_b32_e32 v37, v34
 ; SI-NEXT:    v_mov_b32_e32 v34, v24
 ; SI-NEXT:    v_mov_b32_e32 v32, v38
@@ -23867,34 +23856,34 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v38, v32
 ; SI-NEXT:    v_mov_b32_e32 v24, v34
 ; SI-NEXT:    v_mov_b32_e32 v34, v37
-; SI-NEXT:    v_mov_b32_e32 v37, v39
-; SI-NEXT:    v_mov_b32_e32 v39, v36
+; SI-NEXT:    v_mov_b32_e32 v37, v58
+; SI-NEXT:    v_mov_b32_e32 v58, v39
+; SI-NEXT:    v_mov_b32_e32 v31, v40
+; SI-NEXT:    v_mov_b32_e32 v40, v57
+; SI-NEXT:    v_mov_b32_e32 v57, v36
 ; SI-NEXT:    v_mov_b32_e32 v30, v55
 ; SI-NEXT:    v_mov_b32_e32 v55, v56
-; SI-NEXT:    v_mov_b32_e32 v29, v54
-; SI-NEXT:    v_mov_b32_e32 v54, v47
-; SI-NEXT:    v_mov_b32_e32 v47, v33
+; SI-NEXT:    v_mov_b32_e32 v56, v42
 ; SI-NEXT:    v_mov_b32_e32 v32, v41
-; SI-NEXT:    v_mov_b32_e32 v33, v60
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; SI-NEXT:    v_mov_b32_e32 v23, v51
 ; SI-NEXT:    v_mov_b32_e32 v51, v44
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; SI-NEXT:    v_mov_b32_e32 v22, v50
 ; SI-NEXT:    v_mov_b32_e32 v50, v43
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT:    v_mov_b32_e32 v20, v49
-; SI-NEXT:    v_mov_b32_e32 v49, v58
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v49, v60
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; SI-NEXT:    v_mov_b32_e32 v21, v48
-; SI-NEXT:    v_mov_b32_e32 v48, v42
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v48, v59
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v31, v40
-; SI-NEXT:    v_mov_b32_e32 v40, v57
-; SI-NEXT:    v_mov_b32_e32 v57, v59
+; SI-NEXT:    v_mov_b32_e32 v29, v54
+; SI-NEXT:    v_mov_b32_e32 v54, v47
+; SI-NEXT:    v_mov_b32_e32 v47, v33
 ; SI-NEXT:    v_mov_b32_e32 v28, v53
 ; SI-NEXT:    v_mov_b32_e32 v53, v46
 ; SI-NEXT:    v_mov_b32_e32 v27, v52
@@ -30104,7 +30093,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
 ; SI-NEXT:    v_cvt_f16_f32_e32 v62, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v44, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v43, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v56, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v41, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v40, v9
 ; SI-NEXT:    v_cvt_f16_f32_e32 v55, v8
@@ -30124,26 +30113,26 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
 ; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v20, v25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v21, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v61, s17
-; SI-NEXT:    v_cvt_f16_f32_e32 v57, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v60, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v58, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v58, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, s16
+; SI-NEXT:    v_cvt_f16_f32_e32 v59, s19
+; SI-NEXT:    v_cvt_f16_f32_e32 v57, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v39, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v42, s20
+; SI-NEXT:    v_cvt_f16_f32_e32 v56, s20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v38, s23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v37, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v36, s25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v35, s24
 ; SI-NEXT:    v_cvt_f16_f32_e32 v34, s27
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, s26
+; SI-NEXT:    v_cvt_f16_f32_e32 v61, s26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v25, s29
 ; SI-NEXT:    v_cvt_f16_f32_e32 v24, s28
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v26
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB55_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v61
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v60
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v58
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v59
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v39
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v38
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v36
@@ -30152,7 +30141,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v32
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v63
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v44
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v56
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v42
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v40
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v54
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v52
@@ -30162,12 +30151,12 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
 ; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v28
 ; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v23
 ; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; SI-NEXT:    v_or_b32_e32 v0, v57, v0
-; SI-NEXT:    v_or_b32_e32 v1, v58, v1
-; SI-NEXT:    v_or_b32_e32 v2, v42, v2
+; SI-NEXT:    v_or_b32_e32 v0, v60, v0
+; SI-NEXT:    v_or_b32_e32 v1, v57, v1
+; SI-NEXT:    v_or_b32_e32 v2, v56, v2
 ; SI-NEXT:    v_or_b32_e32 v3, v37, v3
 ; SI-NEXT:    v_or_b32_e32 v4, v35, v4
-; SI-NEXT:    v_or_b32_e32 v5, v33, v5
+; SI-NEXT:    v_or_b32_e32 v5, v61, v5
 ; SI-NEXT:    v_or_b32_e32 v6, v24, v6
 ; SI-NEXT:    v_or_b32_e32 v7, v47, v7
 ; SI-NEXT:    v_or_b32_e32 v8, v62, v8
@@ -30184,12 +30173,11 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v19, v21, v19
 ; SI-NEXT:    s_cbranch_execnz .LBB55_3
 ; SI-NEXT:  .LBB55_2: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v61
-; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v60
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v57
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v58
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v58
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v59
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v60
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v57
 ; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
@@ -30203,8 +30191,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v39
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v42
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v56
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v37
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, v35
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
@@ -30218,7 +30205,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v38
 ; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v33
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v61
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v24
@@ -30245,7 +30232,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v56
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v42
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v25
@@ -30369,20 +30356,20 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB55_4:
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v42, v48
+; SI-NEXT:    v_mov_b32_e32 v59, v48
 ; SI-NEXT:    v_mov_b32_e32 v48, v21
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v58, v49
+; SI-NEXT:    v_mov_b32_e32 v60, v49
 ; SI-NEXT:    v_mov_b32_e32 v49, v20
 ; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v43, v50
 ; SI-NEXT:    v_mov_b32_e32 v50, v22
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v44, v51
 ; SI-NEXT:    v_mov_b32_e32 v51, v23
@@ -30390,21 +30377,22 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
 ; SI-NEXT:    v_mov_b32_e32 v52, v27
 ; SI-NEXT:    v_mov_b32_e32 v46, v53
 ; SI-NEXT:    v_mov_b32_e32 v53, v28
-; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v60, v33
+; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; SI-NEXT:    v_mov_b32_e32 v41, v32
 ; SI-NEXT:    v_mov_b32_e32 v33, v47
 ; SI-NEXT:    v_mov_b32_e32 v47, v54
 ; SI-NEXT:    v_mov_b32_e32 v54, v29
+; SI-NEXT:    v_mov_b32_e32 v42, v56
 ; SI-NEXT:    v_mov_b32_e32 v56, v55
 ; SI-NEXT:    v_mov_b32_e32 v55, v30
-; SI-NEXT:    v_mov_b32_e32 v59, v57
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v36, v57
 ; SI-NEXT:    v_mov_b32_e32 v57, v40
 ; SI-NEXT:    v_mov_b32_e32 v40, v31
-; SI-NEXT:    v_mov_b32_e32 v36, v39
-; SI-NEXT:    v_mov_b32_e32 v39, v37
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v39, v58
+; SI-NEXT:    v_mov_b32_e32 v58, v37
 ; SI-NEXT:    v_mov_b32_e32 v37, v34
 ; SI-NEXT:    v_mov_b32_e32 v34, v24
 ; SI-NEXT:    v_mov_b32_e32 v32, v38
@@ -30416,34 +30404,34 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
 ; SI-NEXT:    v_mov_b32_e32 v38, v32
 ; SI-NEXT:    v_mov_b32_e32 v24, v34
 ; SI-NEXT:    v_mov_b32_e32 v34, v37
-; SI-NEXT:    v_mov_b32_e32 v37, v39
-; SI-NEXT:    v_mov_b32_e32 v39, v36
+; SI-NEXT:    v_mov_b32_e32 v37, v58
+; SI-NEXT:    v_mov_b32_e32 v58, v39
+; SI-NEXT:    v_mov_b32_e32 v31, v40
+; SI-NEXT:    v_mov_b32_e32 v40, v57
+; SI-NEXT:    v_mov_b32_e32 v57, v36
 ; SI-NEXT:    v_mov_b32_e32 v30, v55
 ; SI-NEXT:    v_mov_b32_e32 v55, v56
-; SI-NEXT:    v_mov_b32_e32 v29, v54
-; SI-NEXT:    v_mov_b32_e32 v54, v47
-; SI-NEXT:    v_mov_b32_e32 v47, v33
+; SI-NEXT:    v_mov_b32_e32 v56, v42
 ; SI-NEXT:    v_mov_b32_e32 v32, v41
-; SI-NEXT:    v_mov_b32_e32 v33, v60
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; SI-NEXT:    v_mov_b32_e32 v23, v51
 ; SI-NEXT:    v_mov_b32_e32 v51, v44
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; SI-NEXT:    v_mov_b32_e32 v22, v50
 ; SI-NEXT:    v_mov_b32_e32 v50, v43
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT:    v_mov_b32_e32 v20, v49
-; SI-NEXT:    v_mov_b32_e32 v49, v58
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v49, v60
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; SI-NEXT:    v_mov_b32_e32 v21, v48
-; SI-NEXT:    v_mov_b32_e32 v48, v42
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v48, v59
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v31, v40
-; SI-NEXT:    v_mov_b32_e32 v40, v57
-; SI-NEXT:    v_mov_b32_e32 v57, v59
+; SI-NEXT:    v_mov_b32_e32 v29, v54
+; SI-NEXT:    v_mov_b32_e32 v54, v47
+; SI-NEXT:    v_mov_b32_e32 v47, v33
 ; SI-NEXT:    v_mov_b32_e32 v28, v53
 ; SI-NEXT:    v_mov_b32_e32 v53, v46
 ; SI-NEXT:    v_mov_b32_e32 v27, v52

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
index 50b13db0e15d4..b73870977c429 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
@@ -2889,7 +2889,7 @@ define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s8, 0
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB25_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[2:3], s[0:1], 24
@@ -2897,8 +2897,8 @@ define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) {
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s6, s0, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s8, s0, 8
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s7
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s7, s0, 8
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s8
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB25_3
 ; GFX11-TRUE16-NEXT:  .LBB25_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_u32 s0, s0, 3
@@ -2908,10 +2908,10 @@ define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) {
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s3, s1, 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s5, s1, 8
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s8, s0, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s7, s0, 8
 ; GFX11-TRUE16-NEXT:  .LBB25_3: ; %end
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s8
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s7
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, s6
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, s2
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, s1
@@ -2920,7 +2920,7 @@ define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) {
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, s3
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB25_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr8_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr7_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr6_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr2_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr5_lo16
@@ -2932,7 +2932,7 @@ define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) {
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s8, 0
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB25_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[2:3], s[0:1], 24
@@ -2940,8 +2940,8 @@ define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) {
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s6, s0, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s8, s0, 8
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s7
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s7, s0, 8
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s8
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB25_3
 ; GFX11-FAKE16-NEXT:  .LBB25_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    s_add_u32 s0, s0, 3
@@ -2951,16 +2951,16 @@ define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) {
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s3, s1, 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s5, s1, 8
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s8, s0, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s7, s0, 8
 ; GFX11-FAKE16-NEXT:  .LBB25_3: ; %end
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s8
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s7
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s2
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s5
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s3
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-FAKE16-NEXT:  .LBB25_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr7
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr6
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr2
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr5
@@ -9107,7 +9107,7 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s8, 0
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB69_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-TRUE16-NEXT:    s_lshr_b64 s[2:3], s[0:1], 24
@@ -9115,8 +9115,8 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s6, s0, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s8, s0, 8
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s7
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s7, s0, 8
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s8
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB69_3
 ; GFX11-TRUE16-NEXT:  .LBB69_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_i32 s1, s1, 3
@@ -9126,10 +9126,10 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s3, s1, 24
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s6, s0, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s8, s0, 8
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s7, s0, 8
 ; GFX11-TRUE16-NEXT:  .LBB69_3: ; %end
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s8
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s7
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, s6
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, s2
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, s1
@@ -9138,7 +9138,7 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, s3
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB69_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr8_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr7_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr6_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr2_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr5_lo16
@@ -9150,7 +9150,7 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s8, 0
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB69_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX11-FAKE16-NEXT:    s_lshr_b64 s[2:3], s[0:1], 24
@@ -9158,8 +9158,8 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s6, s0, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s8, s0, 8
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s7
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s7, s0, 8
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s8
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB69_3
 ; GFX11-FAKE16-NEXT:  .LBB69_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    s_add_i32 s1, s1, 3
@@ -9169,16 +9169,16 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s3, s1, 24
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s6, s0, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s8, s0, 8
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s7, s0, 8
 ; GFX11-FAKE16-NEXT:  .LBB69_3: ; %end
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s8
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s7
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s2
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s5
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s3
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-FAKE16-NEXT:  .LBB69_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr7
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr6
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr2
 ; GFX11-FAKE16-NEXT:    ; implicit-def: $sgpr5
@@ -13747,45 +13747,47 @@ define <8 x i8> @bitcast_v4i16_to_v8i8(<4 x i16> %a, i32 %b) {
 ; VI-LABEL: bitcast_v4i16_to_v8i8:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v9, v1
-; VI-NEXT:    v_mov_b32_e32 v8, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
-; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
-; VI-NEXT:    ; implicit-def: $vgpr1
+; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; VI-NEXT:    ; implicit-def: $vgpr9
+; VI-NEXT:    ; implicit-def: $vgpr4
 ; VI-NEXT:    ; implicit-def: $vgpr3
+; VI-NEXT:    ; implicit-def: $vgpr8
 ; VI-NEXT:    ; implicit-def: $vgpr5
 ; VI-NEXT:    ; implicit-def: $vgpr7
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
-; VI-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
-; VI-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
-; VI-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; VI-NEXT:    v_lshrrev_b64 v[3:4], 24, v[0:1]
+; VI-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; VI-NEXT:    v_mov_b32_e32 v9, v0
+; VI-NEXT:    v_mov_b32_e32 v8, v1
+; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; VI-NEXT:  ; %bb.2: ; %Flow
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB96_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
-; VI-NEXT:    v_mov_b32_e32 v0, 3
-; VI-NEXT:    v_add_u16_sdwa v6, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_sdwa v2, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v10, 3, v9
+; VI-NEXT:    v_mov_b32_e32 v2, 3
+; VI-NEXT:    v_add_u16_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_e32 v8, 3, v1
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; VI-NEXT:    v_add_u16_e32 v9, 3, v8
+; VI-NEXT:    v_add_u16_e32 v9, 3, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; VI-NEXT:    v_or_b32_e32 v1, v10, v1
+; VI-NEXT:    v_or_b32_e32 v1, v8, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v9, v0
 ; VI-NEXT:    v_lshrrev_b64 v[3:4], 24, v[0:1]
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
 ; VI-NEXT:    v_bfe_u32 v7, v6, 8, 8
-; VI-NEXT:    v_mov_b32_e32 v8, v9
-; VI-NEXT:    v_mov_b32_e32 v9, v10
 ; VI-NEXT:  .LBB96_4: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    v_mov_b32_e32 v0, v8
-; VI-NEXT:    v_mov_b32_e32 v4, v9
+; VI-NEXT:    v_mov_b32_e32 v0, v9
+; VI-NEXT:    v_mov_b32_e32 v1, v4
+; VI-NEXT:    v_mov_b32_e32 v4, v8
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: bitcast_v4i16_to_v8i8:
@@ -13982,47 +13984,48 @@ define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inre
 ; VI-NEXT:    s_cbranch_scc0 .LBB97_4
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    s_lshr_b64 s[4:5], s[16:17], 24
-; VI-NEXT:    s_lshr_b32 s9, s17, 24
+; VI-NEXT:    s_lshr_b32 s10, s17, 24
 ; VI-NEXT:    s_lshr_b32 s8, s17, 16
 ; VI-NEXT:    s_lshr_b32 s5, s17, 8
-; VI-NEXT:    s_lshr_b32 s10, s16, 16
-; VI-NEXT:    s_lshr_b32 s11, s16, 8
+; VI-NEXT:    s_lshr_b32 s11, s16, 16
+; VI-NEXT:    s_lshr_b32 s12, s16, 8
+; VI-NEXT:    s_mov_b32 s9, s17
 ; VI-NEXT:    s_cbranch_execnz .LBB97_3
 ; VI-NEXT:  .LBB97_2: ; %cmp.true
-; VI-NEXT:    s_lshr_b32 s6, s17, 16
-; VI-NEXT:    s_add_i32 s4, s17, 3
-; VI-NEXT:    s_add_i32 s8, s6, 3
-; VI-NEXT:    s_and_b32 s5, s4, 0xffff
-; VI-NEXT:    s_lshl_b32 s6, s8, 16
-; VI-NEXT:    s_or_b32 s7, s5, s6
-; VI-NEXT:    s_and_b32 s5, s16, 0xffff0000
+; VI-NEXT:    s_lshr_b32 s5, s17, 16
+; VI-NEXT:    s_add_i32 s9, s17, 3
+; VI-NEXT:    s_add_i32 s8, s5, 3
+; VI-NEXT:    s_and_b32 s4, s9, 0xffff
+; VI-NEXT:    s_lshl_b32 s5, s8, 16
+; VI-NEXT:    s_or_b32 s7, s4, s5
+; VI-NEXT:    s_and_b32 s4, s16, 0xffff0000
 ; VI-NEXT:    s_add_i32 s16, s16, 3
-; VI-NEXT:    s_and_b32 s6, s16, 0xffff
-; VI-NEXT:    s_or_b32 s5, s5, s6
-; VI-NEXT:    s_add_i32 s6, s5, 0x30000
-; VI-NEXT:    s_mov_b32 s17, s4
+; VI-NEXT:    s_and_b32 s5, s16, 0xffff
+; VI-NEXT:    s_or_b32 s4, s4, s5
+; VI-NEXT:    s_add_i32 s6, s4, 0x30000
 ; VI-NEXT:    s_lshr_b64 s[4:5], s[6:7], 24
 ; VI-NEXT:    s_lshr_b32 s5, s7, 8
-; VI-NEXT:    s_lshr_b32 s10, s6, 16
-; VI-NEXT:    s_lshr_b32 s11, s6, 8
-; VI-NEXT:    s_bfe_u32 s9, s8, 0x80008
+; VI-NEXT:    s_lshr_b32 s11, s6, 16
+; VI-NEXT:    s_lshr_b32 s12, s6, 8
+; VI-NEXT:    s_bfe_u32 s10, s8, 0x80008
 ; VI-NEXT:  .LBB97_3: ; %end
 ; VI-NEXT:    v_mov_b32_e32 v0, s16
-; VI-NEXT:    v_mov_b32_e32 v1, s11
-; VI-NEXT:    v_mov_b32_e32 v2, s10
+; VI-NEXT:    v_mov_b32_e32 v1, s12
+; VI-NEXT:    v_mov_b32_e32 v2, s11
 ; VI-NEXT:    v_mov_b32_e32 v3, s4
-; VI-NEXT:    v_mov_b32_e32 v4, s17
+; VI-NEXT:    v_mov_b32_e32 v4, s9
 ; VI-NEXT:    v_mov_b32_e32 v5, s5
 ; VI-NEXT:    v_mov_b32_e32 v6, s8
-; VI-NEXT:    v_mov_b32_e32 v7, s9
+; VI-NEXT:    v_mov_b32_e32 v7, s10
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ; VI-NEXT:  .LBB97_4:
+; VI-NEXT:    ; implicit-def: $sgpr12
 ; VI-NEXT:    ; implicit-def: $sgpr11
-; VI-NEXT:    ; implicit-def: $sgpr10
 ; VI-NEXT:    ; implicit-def: $sgpr4
+; VI-NEXT:    ; implicit-def: $sgpr9
 ; VI-NEXT:    ; implicit-def: $sgpr5
 ; VI-NEXT:    ; implicit-def: $sgpr8
-; VI-NEXT:    ; implicit-def: $sgpr9
+; VI-NEXT:    ; implicit-def: $sgpr10
 ; VI-NEXT:    s_branch .LBB97_2
 ;
 ; GFX9-LABEL: bitcast_v4i16_to_v8i8_scalar:
@@ -16964,78 +16967,77 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v8i8:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
-; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB108_2
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[8:9], 24, v[0:1]
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v1.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.h, v0.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v1.l
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[10:11], 24, v[8:9]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v9.h
 ; GFX11-TRUE16-NEXT:  .LBB108_2: ; %Flow
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB108_4
 ; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v1, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v4, v2, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add3_u32 v4, v7, v1, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc_lo
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v3, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v8
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v5, 0x40c00000, v0 :: v_dual_lshlrev_b32 v0, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_add_f32 v1, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v3, v5, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v0, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v0
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v9, v7, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v2
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v11, v5, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v12, v4, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v3, v3, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v7, v10, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v9, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v3, v6, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc_lo
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v10, 0xffff, v3, v1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v12, v8, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v11, 0xffff, v2, v6
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 8, v10
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[8:9], 24, v[10:11]
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v8, 0xffff, v3, v2
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v9, 0xffff, v1, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[10:11], 24, v[8:9]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
 ; GFX11-TRUE16-NEXT:  .LBB108_4: ; %end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v10.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v8.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v10.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
index 51ed72a3a16fe..2a96722ccce0b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
@@ -3642,7 +3642,7 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX11-TRUE16-NEXT:    s_mov_b32 s58, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s62, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s8, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB13_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -3664,11 +3664,11 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s47, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s56, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s58
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s58, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s62
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB13_3
 ; GFX11-TRUE16-NEXT:  .LBB13_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_i32 s4, s4, 3
@@ -3711,22 +3711,19 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s47, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s56, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s58, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB13_3: ; %end
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s58, s62
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s15, s26, s15
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s58
-; GFX11-TRUE16-NEXT:    s_mov_b32 s58, s60
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s14, s27, s14
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s26, s28
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s27, s29
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s61
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s58
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s59
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s61
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s60
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s59
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s58
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s57
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s56
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s18, s18, s47
@@ -3756,10 +3753,10 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB13_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr62_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr61_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr60_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr59_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr56_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr47_lo16
@@ -7386,7 +7383,7 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX11-TRUE16-NEXT:    s_mov_b32 s58, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s62, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s8, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB17_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -7408,11 +7405,11 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s47, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s56, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s58
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s58, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s62
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB17_3
 ; GFX11-TRUE16-NEXT:  .LBB17_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_i32 s4, s4, 3
@@ -7455,22 +7452,19 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s47, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s56, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s58, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB17_3: ; %end
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s58, s62
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s15, s26, s15
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s58
-; GFX11-TRUE16-NEXT:    s_mov_b32 s58, s60
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s14, s27, s14
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s26, s28
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s27, s29
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s61
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s58
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s59
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s61
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s60
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s59
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s58
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s57
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s56
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s18, s18, s47
@@ -7500,10 +7494,10 @@ define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB17_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr62_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr61_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr60_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr59_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr56_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr47_lo16
@@ -8781,13 +8775,13 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v1
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_cvt_f16_f32_e32 v60, v0
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v48, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v39, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v38, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v37, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v49, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v35, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v50, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v32, v9
@@ -8815,36 +8809,37 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v51, s17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v33, s16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v34, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, s20
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v63, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v62, s23
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, s22
+; SI-NEXT:    v_cvt_f16_f32_e32 v63, s23
+; SI-NEXT:    v_cvt_f16_f32_e32 v62, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v61, s25
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, s24
+; SI-NEXT:    v_cvt_f16_f32_e32 v36, s24
 ; SI-NEXT:    v_cvt_f16_f32_e32 v29, s27
 ; SI-NEXT:    v_cvt_f16_f32_e32 v28, s26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v27, s29
 ; SI-NEXT:    v_cvt_f16_f32_e32 v26, s28
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v30
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB19_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v51
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v51
-; SI-NEXT:    v_or_b32_e32 v1, v2, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v62
+; SI-NEXT:    v_or_b32_e32 v2, v3, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v63
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v61
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v29
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v27
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v39
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v60
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v39
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v37
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v35
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v32
@@ -8859,14 +8854,14 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v25
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
 ; SI-NEXT:    v_or_b32_e32 v0, v33, v0
-; SI-NEXT:    v_or_b32_e32 v2, v63, v2
-; SI-NEXT:    v_or_b32_e32 v3, v34, v3
-; SI-NEXT:    v_or_b32_e32 v4, v49, v4
+; SI-NEXT:    v_or_b32_e32 v1, v34, v1
+; SI-NEXT:    v_or_b32_e32 v3, v62, v3
+; SI-NEXT:    v_or_b32_e32 v4, v36, v4
 ; SI-NEXT:    v_or_b32_e32 v5, v28, v5
 ; SI-NEXT:    v_or_b32_e32 v6, v26, v6
-; SI-NEXT:    v_or_b32_e32 v7, v60, v7
+; SI-NEXT:    v_or_b32_e32 v7, v48, v7
 ; SI-NEXT:    v_or_b32_e32 v8, v38, v8
-; SI-NEXT:    v_or_b32_e32 v9, v36, v9
+; SI-NEXT:    v_or_b32_e32 v9, v49, v9
 ; SI-NEXT:    v_or_b32_e32 v10, v50, v10
 ; SI-NEXT:    v_or_b32_e32 v11, v59, v11
 ; SI-NEXT:    v_or_b32_e32 v12, v57, v12
@@ -8882,64 +8877,65 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    s_cbranch_execnz .LBB19_3
 ; SI-NEXT:  .LBB19_2: ; %cmp.true
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v51
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v33
-; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v34
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v34
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v62
 ; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v36
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v28
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v60
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v38
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v48
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v38
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, v37
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v50
 ; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v36
 ; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v50
 ; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v58
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v58
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v57
+; SI-NEXT:    v_cvt_f32_f16_e32 v15, v47
+; SI-NEXT:    v_cvt_f32_f16_e32 v16, v44
 ; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v47
 ; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v44
 ; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
 ; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v43
 ; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v43
 ; SI-NEXT:    v_cvt_f32_f16_e32 v18, v41
+; SI-NEXT:    v_cvt_f32_f16_e32 v19, v54
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v53
 ; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT:    v_cvt_f32_f16_e32 v19, v54
 ; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; SI-NEXT:    v_cvt_f32_f16_e32 v20, v53
 ; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v21, v31
@@ -8953,27 +8949,25 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
 ; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v2
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v63
-; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v62
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v63
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
@@ -8993,12 +8987,12 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; SI-NEXT:    v_or_b32_e32 v6, v7, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v39
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v60
 ; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; SI-NEXT:    v_or_b32_e32 v7, v8, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v48
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v39
 ; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
@@ -9086,89 +9080,86 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB19_4:
 ; SI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v59, v46
 ; SI-NEXT:    v_mov_b32_e32 v46, v41
 ; SI-NEXT:    v_mov_b32_e32 v41, v52
 ; SI-NEXT:    v_mov_b32_e32 v52, v23
-; SI-NEXT:    v_mov_b32_e32 v48, v39
-; SI-NEXT:    v_mov_b32_e32 v39, v60
+; SI-NEXT:    v_mov_b32_e32 v48, v60
 ; SI-NEXT:    v_mov_b32_e32 v60, v47
 ; SI-NEXT:    v_mov_b32_e32 v47, v42
 ; SI-NEXT:    v_mov_b32_e32 v42, v53
 ; SI-NEXT:    v_mov_b32_e32 v53, v22
-; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v34, v61
+; SI-NEXT:    v_mov_b32_e32 v35, v61
 ; SI-NEXT:    v_mov_b32_e32 v61, v56
 ; SI-NEXT:    v_mov_b32_e32 v56, v43
 ; SI-NEXT:    v_mov_b32_e32 v43, v54
 ; SI-NEXT:    v_mov_b32_e32 v54, v24
-; SI-NEXT:    v_mov_b32_e32 v50, v62
+; SI-NEXT:    v_mov_b32_e32 v50, v34
+; SI-NEXT:    v_mov_b32_e32 v34, v62
 ; SI-NEXT:    v_mov_b32_e32 v62, v57
 ; SI-NEXT:    v_mov_b32_e32 v57, v44
 ; SI-NEXT:    v_mov_b32_e32 v44, v55
 ; SI-NEXT:    v_mov_b32_e32 v55, v25
-; SI-NEXT:    v_mov_b32_e32 v32, v51
-; SI-NEXT:    v_mov_b32_e32 v51, v33
+; SI-NEXT:    v_mov_b32_e32 v32, v33
 ; SI-NEXT:    v_mov_b32_e32 v33, v63
 ; SI-NEXT:    v_mov_b32_e32 v63, v58
 ; SI-NEXT:    v_mov_b32_e32 v58, v45
 ; SI-NEXT:    v_mov_b32_e32 v45, v40
 ; SI-NEXT:    v_mov_b32_e32 v40, v31
-; SI-NEXT:    v_mov_b32_e32 v38, v26
-; SI-NEXT:    v_mov_b32_e32 v37, v27
-; SI-NEXT:    v_mov_b32_e32 v36, v28
-; SI-NEXT:    v_mov_b32_e32 v35, v49
-; SI-NEXT:    v_mov_b32_e32 v49, v29
+; SI-NEXT:    v_mov_b32_e32 v39, v26
+; SI-NEXT:    v_mov_b32_e32 v38, v27
+; SI-NEXT:    v_mov_b32_e32 v37, v28
+; SI-NEXT:    v_mov_b32_e32 v49, v36
+; SI-NEXT:    v_mov_b32_e32 v36, v29
 ; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; SI-NEXT:    v_mov_b32_e32 v29, v49
-; SI-NEXT:    v_mov_b32_e32 v49, v35
-; SI-NEXT:    v_mov_b32_e32 v28, v36
-; SI-NEXT:    v_mov_b32_e32 v27, v37
-; SI-NEXT:    v_mov_b32_e32 v26, v38
+; SI-NEXT:    v_mov_b32_e32 v29, v36
+; SI-NEXT:    v_mov_b32_e32 v36, v49
+; SI-NEXT:    v_mov_b32_e32 v28, v37
+; SI-NEXT:    v_mov_b32_e32 v27, v38
+; SI-NEXT:    v_mov_b32_e32 v26, v39
 ; SI-NEXT:    v_mov_b32_e32 v31, v40
 ; SI-NEXT:    v_mov_b32_e32 v40, v45
 ; SI-NEXT:    v_mov_b32_e32 v45, v58
 ; SI-NEXT:    v_mov_b32_e32 v58, v63
 ; SI-NEXT:    v_mov_b32_e32 v63, v33
-; SI-NEXT:    v_mov_b32_e32 v33, v51
-; SI-NEXT:    v_mov_b32_e32 v51, v32
+; SI-NEXT:    v_mov_b32_e32 v33, v32
 ; SI-NEXT:    v_mov_b32_e32 v25, v55
 ; SI-NEXT:    v_mov_b32_e32 v55, v44
 ; SI-NEXT:    v_mov_b32_e32 v44, v57
 ; SI-NEXT:    v_mov_b32_e32 v57, v62
-; SI-NEXT:    v_mov_b32_e32 v62, v50
+; SI-NEXT:    v_mov_b32_e32 v62, v34
+; SI-NEXT:    v_mov_b32_e32 v34, v50
 ; SI-NEXT:    v_mov_b32_e32 v24, v54
 ; SI-NEXT:    v_mov_b32_e32 v54, v43
 ; SI-NEXT:    v_mov_b32_e32 v43, v56
 ; SI-NEXT:    v_mov_b32_e32 v56, v61
-; SI-NEXT:    v_mov_b32_e32 v61, v34
-; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v61, v35
 ; SI-NEXT:    v_mov_b32_e32 v22, v53
 ; SI-NEXT:    v_mov_b32_e32 v53, v42
 ; SI-NEXT:    v_mov_b32_e32 v42, v47
 ; SI-NEXT:    v_mov_b32_e32 v47, v60
-; SI-NEXT:    v_mov_b32_e32 v60, v39
-; SI-NEXT:    v_mov_b32_e32 v39, v48
+; SI-NEXT:    v_mov_b32_e32 v60, v48
 ; SI-NEXT:    v_mov_b32_e32 v23, v52
 ; SI-NEXT:    v_mov_b32_e32 v52, v41
 ; SI-NEXT:    v_mov_b32_e32 v41, v46
 ; SI-NEXT:    v_mov_b32_e32 v46, v59
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; SI-NEXT:    s_branch .LBB19_2
 ;
@@ -17674,13 +17665,13 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v1
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_cvt_f16_f32_e32 v60, v0
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v48, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v39, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v38, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v37, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v49, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v35, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v50, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v32, v9
@@ -17708,36 +17699,37 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
 ; SI-NEXT:    v_cvt_f16_f32_e32 v51, s17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v33, s16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v34, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, s20
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v63, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v62, s23
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, s22
+; SI-NEXT:    v_cvt_f16_f32_e32 v63, s23
+; SI-NEXT:    v_cvt_f16_f32_e32 v62, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v61, s25
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, s24
+; SI-NEXT:    v_cvt_f16_f32_e32 v36, s24
 ; SI-NEXT:    v_cvt_f16_f32_e32 v29, s27
 ; SI-NEXT:    v_cvt_f16_f32_e32 v28, s26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v27, s29
 ; SI-NEXT:    v_cvt_f16_f32_e32 v26, s28
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v30
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB35_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v51
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v51
-; SI-NEXT:    v_or_b32_e32 v1, v2, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v62
+; SI-NEXT:    v_or_b32_e32 v2, v3, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v63
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v61
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v29
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v27
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v39
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v60
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v39
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v37
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v35
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v32
@@ -17752,14 +17744,14 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
 ; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v25
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
 ; SI-NEXT:    v_or_b32_e32 v0, v33, v0
-; SI-NEXT:    v_or_b32_e32 v2, v63, v2
-; SI-NEXT:    v_or_b32_e32 v3, v34, v3
-; SI-NEXT:    v_or_b32_e32 v4, v49, v4
+; SI-NEXT:    v_or_b32_e32 v1, v34, v1
+; SI-NEXT:    v_or_b32_e32 v3, v62, v3
+; SI-NEXT:    v_or_b32_e32 v4, v36, v4
 ; SI-NEXT:    v_or_b32_e32 v5, v28, v5
 ; SI-NEXT:    v_or_b32_e32 v6, v26, v6
-; SI-NEXT:    v_or_b32_e32 v7, v60, v7
+; SI-NEXT:    v_or_b32_e32 v7, v48, v7
 ; SI-NEXT:    v_or_b32_e32 v8, v38, v8
-; SI-NEXT:    v_or_b32_e32 v9, v36, v9
+; SI-NEXT:    v_or_b32_e32 v9, v49, v9
 ; SI-NEXT:    v_or_b32_e32 v10, v50, v10
 ; SI-NEXT:    v_or_b32_e32 v11, v59, v11
 ; SI-NEXT:    v_or_b32_e32 v12, v57, v12
@@ -17775,64 +17767,65 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
 ; SI-NEXT:    s_cbranch_execnz .LBB35_3
 ; SI-NEXT:  .LBB35_2: ; %cmp.true
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v51
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v33
-; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v34
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v34
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v62
 ; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v36
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v28
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v60
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v38
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v48
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v38
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, v37
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v50
 ; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v36
 ; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v50
 ; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v58
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v58
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v57
+; SI-NEXT:    v_cvt_f32_f16_e32 v15, v47
+; SI-NEXT:    v_cvt_f32_f16_e32 v16, v44
 ; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v47
 ; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v44
 ; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
 ; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v43
 ; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v43
 ; SI-NEXT:    v_cvt_f32_f16_e32 v18, v41
+; SI-NEXT:    v_cvt_f32_f16_e32 v19, v54
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v53
 ; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT:    v_cvt_f32_f16_e32 v19, v54
 ; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; SI-NEXT:    v_cvt_f32_f16_e32 v20, v53
 ; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v21, v31
@@ -17846,27 +17839,25 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
 ; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
 ; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v2
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v63
-; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v62
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v63
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
@@ -17886,12 +17877,12 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; SI-NEXT:    v_or_b32_e32 v6, v7, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v39
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v60
 ; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; SI-NEXT:    v_or_b32_e32 v7, v8, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v48
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v39
 ; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
@@ -17979,89 +17970,86 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB35_4:
 ; SI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v59, v46
 ; SI-NEXT:    v_mov_b32_e32 v46, v41
 ; SI-NEXT:    v_mov_b32_e32 v41, v52
 ; SI-NEXT:    v_mov_b32_e32 v52, v23
-; SI-NEXT:    v_mov_b32_e32 v48, v39
-; SI-NEXT:    v_mov_b32_e32 v39, v60
+; SI-NEXT:    v_mov_b32_e32 v48, v60
 ; SI-NEXT:    v_mov_b32_e32 v60, v47
 ; SI-NEXT:    v_mov_b32_e32 v47, v42
 ; SI-NEXT:    v_mov_b32_e32 v42, v53
 ; SI-NEXT:    v_mov_b32_e32 v53, v22
-; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v34, v61
+; SI-NEXT:    v_mov_b32_e32 v35, v61
 ; SI-NEXT:    v_mov_b32_e32 v61, v56
 ; SI-NEXT:    v_mov_b32_e32 v56, v43
 ; SI-NEXT:    v_mov_b32_e32 v43, v54
 ; SI-NEXT:    v_mov_b32_e32 v54, v24
-; SI-NEXT:    v_mov_b32_e32 v50, v62
+; SI-NEXT:    v_mov_b32_e32 v50, v34
+; SI-NEXT:    v_mov_b32_e32 v34, v62
 ; SI-NEXT:    v_mov_b32_e32 v62, v57
 ; SI-NEXT:    v_mov_b32_e32 v57, v44
 ; SI-NEXT:    v_mov_b32_e32 v44, v55
 ; SI-NEXT:    v_mov_b32_e32 v55, v25
-; SI-NEXT:    v_mov_b32_e32 v32, v51
-; SI-NEXT:    v_mov_b32_e32 v51, v33
+; SI-NEXT:    v_mov_b32_e32 v32, v33
 ; SI-NEXT:    v_mov_b32_e32 v33, v63
 ; SI-NEXT:    v_mov_b32_e32 v63, v58
 ; SI-NEXT:    v_mov_b32_e32 v58, v45
 ; SI-NEXT:    v_mov_b32_e32 v45, v40
 ; SI-NEXT:    v_mov_b32_e32 v40, v31
-; SI-NEXT:    v_mov_b32_e32 v38, v26
-; SI-NEXT:    v_mov_b32_e32 v37, v27
-; SI-NEXT:    v_mov_b32_e32 v36, v28
-; SI-NEXT:    v_mov_b32_e32 v35, v49
-; SI-NEXT:    v_mov_b32_e32 v49, v29
+; SI-NEXT:    v_mov_b32_e32 v39, v26
+; SI-NEXT:    v_mov_b32_e32 v38, v27
+; SI-NEXT:    v_mov_b32_e32 v37, v28
+; SI-NEXT:    v_mov_b32_e32 v49, v36
+; SI-NEXT:    v_mov_b32_e32 v36, v29
 ; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; SI-NEXT:    v_mov_b32_e32 v29, v49
-; SI-NEXT:    v_mov_b32_e32 v49, v35
-; SI-NEXT:    v_mov_b32_e32 v28, v36
-; SI-NEXT:    v_mov_b32_e32 v27, v37
-; SI-NEXT:    v_mov_b32_e32 v26, v38
+; SI-NEXT:    v_mov_b32_e32 v29, v36
+; SI-NEXT:    v_mov_b32_e32 v36, v49
+; SI-NEXT:    v_mov_b32_e32 v28, v37
+; SI-NEXT:    v_mov_b32_e32 v27, v38
+; SI-NEXT:    v_mov_b32_e32 v26, v39
 ; SI-NEXT:    v_mov_b32_e32 v31, v40
 ; SI-NEXT:    v_mov_b32_e32 v40, v45
 ; SI-NEXT:    v_mov_b32_e32 v45, v58
 ; SI-NEXT:    v_mov_b32_e32 v58, v63
 ; SI-NEXT:    v_mov_b32_e32 v63, v33
-; SI-NEXT:    v_mov_b32_e32 v33, v51
-; SI-NEXT:    v_mov_b32_e32 v51, v32
+; SI-NEXT:    v_mov_b32_e32 v33, v32
 ; SI-NEXT:    v_mov_b32_e32 v25, v55
 ; SI-NEXT:    v_mov_b32_e32 v55, v44
 ; SI-NEXT:    v_mov_b32_e32 v44, v57
 ; SI-NEXT:    v_mov_b32_e32 v57, v62
-; SI-NEXT:    v_mov_b32_e32 v62, v50
+; SI-NEXT:    v_mov_b32_e32 v62, v34
+; SI-NEXT:    v_mov_b32_e32 v34, v50
 ; SI-NEXT:    v_mov_b32_e32 v24, v54
 ; SI-NEXT:    v_mov_b32_e32 v54, v43
 ; SI-NEXT:    v_mov_b32_e32 v43, v56
 ; SI-NEXT:    v_mov_b32_e32 v56, v61
-; SI-NEXT:    v_mov_b32_e32 v61, v34
-; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v61, v35
 ; SI-NEXT:    v_mov_b32_e32 v22, v53
 ; SI-NEXT:    v_mov_b32_e32 v53, v42
 ; SI-NEXT:    v_mov_b32_e32 v42, v47
 ; SI-NEXT:    v_mov_b32_e32 v47, v60
-; SI-NEXT:    v_mov_b32_e32 v60, v39
-; SI-NEXT:    v_mov_b32_e32 v39, v48
+; SI-NEXT:    v_mov_b32_e32 v60, v48
 ; SI-NEXT:    v_mov_b32_e32 v23, v52
 ; SI-NEXT:    v_mov_b32_e32 v52, v41
 ; SI-NEXT:    v_mov_b32_e32 v41, v46
 ; SI-NEXT:    v_mov_b32_e32 v46, v59
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; SI-NEXT:    s_branch .LBB35_2
 ;
@@ -20724,7 +20712,7 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX11-TRUE16-NEXT:    s_mov_b32 s58, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s62, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s8, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB41_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -20746,11 +20734,11 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s47, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s56, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s58
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s58, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s62
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB41_3
 ; GFX11-TRUE16-NEXT:  .LBB41_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_u32 s6, s6, 3
@@ -20793,22 +20781,19 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s47, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s56, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s58, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB41_3: ; %end
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s58, s62
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s15, s26, s15
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s58
-; GFX11-TRUE16-NEXT:    s_mov_b32 s58, s60
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s14, s27, s14
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s26, s28
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s27, s29
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s61
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s58
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s59
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s61
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s60
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s59
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s58
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s57
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s56
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s18, s18, s47
@@ -20838,10 +20823,10 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB41_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr62_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr61_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr60_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr59_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr56_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr47_lo16
@@ -24480,7 +24465,7 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX11-TRUE16-NEXT:    s_mov_b32 s58, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s62, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s8, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB45_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -24502,11 +24487,11 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s47, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s56, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s58
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s58, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s62
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB45_3
 ; GFX11-TRUE16-NEXT:  .LBB45_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_u32 s6, s6, 3
@@ -24549,22 +24534,19 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s47, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s56, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s57, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s58, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB45_3: ; %end
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s58, s62
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s15, s26, s15
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s58
-; GFX11-TRUE16-NEXT:    s_mov_b32 s58, s60
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s14, s27, s14
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s26, s28
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s27, s29
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s61
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s58
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s59
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s61
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s60
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s59
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s58
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s57
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s56
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s18, s18, s47
@@ -24594,10 +24576,10 @@ define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB45_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr62_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr61_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr60_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr59_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr58_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr57_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr56_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr47_lo16
@@ -25875,13 +25857,13 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v1
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_cvt_f16_f32_e32 v60, v0
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v48, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v39, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v38, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v37, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v49, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v35, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v50, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v32, v9
@@ -25909,36 +25891,37 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v51, s17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v33, s16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v34, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, s20
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v63, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v62, s23
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, s22
+; SI-NEXT:    v_cvt_f16_f32_e32 v63, s23
+; SI-NEXT:    v_cvt_f16_f32_e32 v62, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v61, s25
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, s24
+; SI-NEXT:    v_cvt_f16_f32_e32 v36, s24
 ; SI-NEXT:    v_cvt_f16_f32_e32 v29, s27
 ; SI-NEXT:    v_cvt_f16_f32_e32 v28, s26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v27, s29
 ; SI-NEXT:    v_cvt_f16_f32_e32 v26, s28
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v30
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB47_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v51
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v51
-; SI-NEXT:    v_or_b32_e32 v1, v2, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v62
+; SI-NEXT:    v_or_b32_e32 v2, v3, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v63
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v61
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v29
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v27
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v39
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v60
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v39
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v37
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v35
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v32
@@ -25953,14 +25936,14 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v25
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
 ; SI-NEXT:    v_or_b32_e32 v0, v33, v0
-; SI-NEXT:    v_or_b32_e32 v2, v63, v2
-; SI-NEXT:    v_or_b32_e32 v3, v34, v3
-; SI-NEXT:    v_or_b32_e32 v4, v49, v4
+; SI-NEXT:    v_or_b32_e32 v1, v34, v1
+; SI-NEXT:    v_or_b32_e32 v3, v62, v3
+; SI-NEXT:    v_or_b32_e32 v4, v36, v4
 ; SI-NEXT:    v_or_b32_e32 v5, v28, v5
 ; SI-NEXT:    v_or_b32_e32 v6, v26, v6
-; SI-NEXT:    v_or_b32_e32 v7, v60, v7
+; SI-NEXT:    v_or_b32_e32 v7, v48, v7
 ; SI-NEXT:    v_or_b32_e32 v8, v38, v8
-; SI-NEXT:    v_or_b32_e32 v9, v36, v9
+; SI-NEXT:    v_or_b32_e32 v9, v49, v9
 ; SI-NEXT:    v_or_b32_e32 v10, v50, v10
 ; SI-NEXT:    v_or_b32_e32 v11, v59, v11
 ; SI-NEXT:    v_or_b32_e32 v12, v57, v12
@@ -25976,64 +25959,65 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    s_cbranch_execnz .LBB47_3
 ; SI-NEXT:  .LBB47_2: ; %cmp.true
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v51
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v33
-; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v34
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v34
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v62
 ; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v36
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v28
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v60
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v38
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v48
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v38
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, v37
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v50
 ; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v36
 ; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v50
 ; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v58
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v58
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v57
+; SI-NEXT:    v_cvt_f32_f16_e32 v15, v47
+; SI-NEXT:    v_cvt_f32_f16_e32 v16, v44
 ; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v47
 ; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v44
 ; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
 ; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v43
 ; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v43
 ; SI-NEXT:    v_cvt_f32_f16_e32 v18, v41
+; SI-NEXT:    v_cvt_f32_f16_e32 v19, v54
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v53
 ; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT:    v_cvt_f32_f16_e32 v19, v54
 ; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; SI-NEXT:    v_cvt_f32_f16_e32 v20, v53
 ; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v21, v31
@@ -26047,27 +26031,25 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
 ; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v2
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v63
-; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v62
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v63
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
@@ -26087,12 +26069,12 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; SI-NEXT:    v_or_b32_e32 v6, v7, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v39
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v60
 ; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; SI-NEXT:    v_or_b32_e32 v7, v8, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v48
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v39
 ; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
@@ -26180,89 +26162,86 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB47_4:
 ; SI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v59, v46
 ; SI-NEXT:    v_mov_b32_e32 v46, v41
 ; SI-NEXT:    v_mov_b32_e32 v41, v52
 ; SI-NEXT:    v_mov_b32_e32 v52, v23
-; SI-NEXT:    v_mov_b32_e32 v48, v39
-; SI-NEXT:    v_mov_b32_e32 v39, v60
+; SI-NEXT:    v_mov_b32_e32 v48, v60
 ; SI-NEXT:    v_mov_b32_e32 v60, v47
 ; SI-NEXT:    v_mov_b32_e32 v47, v42
 ; SI-NEXT:    v_mov_b32_e32 v42, v53
 ; SI-NEXT:    v_mov_b32_e32 v53, v22
-; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v34, v61
+; SI-NEXT:    v_mov_b32_e32 v35, v61
 ; SI-NEXT:    v_mov_b32_e32 v61, v56
 ; SI-NEXT:    v_mov_b32_e32 v56, v43
 ; SI-NEXT:    v_mov_b32_e32 v43, v54
 ; SI-NEXT:    v_mov_b32_e32 v54, v24
-; SI-NEXT:    v_mov_b32_e32 v50, v62
+; SI-NEXT:    v_mov_b32_e32 v50, v34
+; SI-NEXT:    v_mov_b32_e32 v34, v62
 ; SI-NEXT:    v_mov_b32_e32 v62, v57
 ; SI-NEXT:    v_mov_b32_e32 v57, v44
 ; SI-NEXT:    v_mov_b32_e32 v44, v55
 ; SI-NEXT:    v_mov_b32_e32 v55, v25
-; SI-NEXT:    v_mov_b32_e32 v32, v51
-; SI-NEXT:    v_mov_b32_e32 v51, v33
+; SI-NEXT:    v_mov_b32_e32 v32, v33
 ; SI-NEXT:    v_mov_b32_e32 v33, v63
 ; SI-NEXT:    v_mov_b32_e32 v63, v58
 ; SI-NEXT:    v_mov_b32_e32 v58, v45
 ; SI-NEXT:    v_mov_b32_e32 v45, v40
 ; SI-NEXT:    v_mov_b32_e32 v40, v31
-; SI-NEXT:    v_mov_b32_e32 v38, v26
-; SI-NEXT:    v_mov_b32_e32 v37, v27
-; SI-NEXT:    v_mov_b32_e32 v36, v28
-; SI-NEXT:    v_mov_b32_e32 v35, v49
-; SI-NEXT:    v_mov_b32_e32 v49, v29
+; SI-NEXT:    v_mov_b32_e32 v39, v26
+; SI-NEXT:    v_mov_b32_e32 v38, v27
+; SI-NEXT:    v_mov_b32_e32 v37, v28
+; SI-NEXT:    v_mov_b32_e32 v49, v36
+; SI-NEXT:    v_mov_b32_e32 v36, v29
 ; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; SI-NEXT:    v_mov_b32_e32 v29, v49
-; SI-NEXT:    v_mov_b32_e32 v49, v35
-; SI-NEXT:    v_mov_b32_e32 v28, v36
-; SI-NEXT:    v_mov_b32_e32 v27, v37
-; SI-NEXT:    v_mov_b32_e32 v26, v38
+; SI-NEXT:    v_mov_b32_e32 v29, v36
+; SI-NEXT:    v_mov_b32_e32 v36, v49
+; SI-NEXT:    v_mov_b32_e32 v28, v37
+; SI-NEXT:    v_mov_b32_e32 v27, v38
+; SI-NEXT:    v_mov_b32_e32 v26, v39
 ; SI-NEXT:    v_mov_b32_e32 v31, v40
 ; SI-NEXT:    v_mov_b32_e32 v40, v45
 ; SI-NEXT:    v_mov_b32_e32 v45, v58
 ; SI-NEXT:    v_mov_b32_e32 v58, v63
 ; SI-NEXT:    v_mov_b32_e32 v63, v33
-; SI-NEXT:    v_mov_b32_e32 v33, v51
-; SI-NEXT:    v_mov_b32_e32 v51, v32
+; SI-NEXT:    v_mov_b32_e32 v33, v32
 ; SI-NEXT:    v_mov_b32_e32 v25, v55
 ; SI-NEXT:    v_mov_b32_e32 v55, v44
 ; SI-NEXT:    v_mov_b32_e32 v44, v57
 ; SI-NEXT:    v_mov_b32_e32 v57, v62
-; SI-NEXT:    v_mov_b32_e32 v62, v50
+; SI-NEXT:    v_mov_b32_e32 v62, v34
+; SI-NEXT:    v_mov_b32_e32 v34, v50
 ; SI-NEXT:    v_mov_b32_e32 v24, v54
 ; SI-NEXT:    v_mov_b32_e32 v54, v43
 ; SI-NEXT:    v_mov_b32_e32 v43, v56
 ; SI-NEXT:    v_mov_b32_e32 v56, v61
-; SI-NEXT:    v_mov_b32_e32 v61, v34
-; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v61, v35
 ; SI-NEXT:    v_mov_b32_e32 v22, v53
 ; SI-NEXT:    v_mov_b32_e32 v53, v42
 ; SI-NEXT:    v_mov_b32_e32 v42, v47
 ; SI-NEXT:    v_mov_b32_e32 v47, v60
-; SI-NEXT:    v_mov_b32_e32 v60, v39
-; SI-NEXT:    v_mov_b32_e32 v39, v48
+; SI-NEXT:    v_mov_b32_e32 v60, v48
 ; SI-NEXT:    v_mov_b32_e32 v23, v52
 ; SI-NEXT:    v_mov_b32_e32 v52, v41
 ; SI-NEXT:    v_mov_b32_e32 v41, v46
 ; SI-NEXT:    v_mov_b32_e32 v46, v59
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; SI-NEXT:    s_branch .LBB47_2
 ;
@@ -33131,13 +33110,13 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v1
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_cvt_f16_f32_e32 v60, v0
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v48, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v39, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v38, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v37, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v49, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v35, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v50, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v32, v9
@@ -33165,36 +33144,37 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
 ; SI-NEXT:    v_cvt_f16_f32_e32 v51, s17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v33, s16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v34, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, s20
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v63, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v62, s23
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, s22
+; SI-NEXT:    v_cvt_f16_f32_e32 v63, s23
+; SI-NEXT:    v_cvt_f16_f32_e32 v62, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v61, s25
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, s24
+; SI-NEXT:    v_cvt_f16_f32_e32 v36, s24
 ; SI-NEXT:    v_cvt_f16_f32_e32 v29, s27
 ; SI-NEXT:    v_cvt_f16_f32_e32 v28, s26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v27, s29
 ; SI-NEXT:    v_cvt_f16_f32_e32 v26, s28
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v30
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB55_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v51
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v51
-; SI-NEXT:    v_or_b32_e32 v1, v2, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v62
+; SI-NEXT:    v_or_b32_e32 v2, v3, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v63
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v61
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v29
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v27
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v39
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v60
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v39
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v37
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v35
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v32
@@ -33209,14 +33189,14 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
 ; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v25
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
 ; SI-NEXT:    v_or_b32_e32 v0, v33, v0
-; SI-NEXT:    v_or_b32_e32 v2, v63, v2
-; SI-NEXT:    v_or_b32_e32 v3, v34, v3
-; SI-NEXT:    v_or_b32_e32 v4, v49, v4
+; SI-NEXT:    v_or_b32_e32 v1, v34, v1
+; SI-NEXT:    v_or_b32_e32 v3, v62, v3
+; SI-NEXT:    v_or_b32_e32 v4, v36, v4
 ; SI-NEXT:    v_or_b32_e32 v5, v28, v5
 ; SI-NEXT:    v_or_b32_e32 v6, v26, v6
-; SI-NEXT:    v_or_b32_e32 v7, v60, v7
+; SI-NEXT:    v_or_b32_e32 v7, v48, v7
 ; SI-NEXT:    v_or_b32_e32 v8, v38, v8
-; SI-NEXT:    v_or_b32_e32 v9, v36, v9
+; SI-NEXT:    v_or_b32_e32 v9, v49, v9
 ; SI-NEXT:    v_or_b32_e32 v10, v50, v10
 ; SI-NEXT:    v_or_b32_e32 v11, v59, v11
 ; SI-NEXT:    v_or_b32_e32 v12, v57, v12
@@ -33232,64 +33212,65 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
 ; SI-NEXT:    s_cbranch_execnz .LBB55_3
 ; SI-NEXT:  .LBB55_2: ; %cmp.true
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v51
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v33
-; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v34
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v34
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v62
 ; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v36
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v28
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v60
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v38
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v48
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v38
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, v37
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v50
 ; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v36
 ; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v50
 ; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v58
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v58
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v57
+; SI-NEXT:    v_cvt_f32_f16_e32 v15, v47
+; SI-NEXT:    v_cvt_f32_f16_e32 v16, v44
 ; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v47
 ; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v44
 ; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
 ; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v43
 ; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v43
 ; SI-NEXT:    v_cvt_f32_f16_e32 v18, v41
+; SI-NEXT:    v_cvt_f32_f16_e32 v19, v54
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v53
 ; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT:    v_cvt_f32_f16_e32 v19, v54
 ; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; SI-NEXT:    v_cvt_f32_f16_e32 v20, v53
 ; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v21, v31
@@ -33303,27 +33284,25 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
 ; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
 ; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v2
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v63
-; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v62
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v63
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
@@ -33343,12 +33322,12 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; SI-NEXT:    v_or_b32_e32 v6, v7, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v39
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v60
 ; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; SI-NEXT:    v_or_b32_e32 v7, v8, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v48
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v39
 ; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
@@ -33436,89 +33415,86 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB55_4:
 ; SI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v59, v46
 ; SI-NEXT:    v_mov_b32_e32 v46, v41
 ; SI-NEXT:    v_mov_b32_e32 v41, v52
 ; SI-NEXT:    v_mov_b32_e32 v52, v23
-; SI-NEXT:    v_mov_b32_e32 v48, v39
-; SI-NEXT:    v_mov_b32_e32 v39, v60
+; SI-NEXT:    v_mov_b32_e32 v48, v60
 ; SI-NEXT:    v_mov_b32_e32 v60, v47
 ; SI-NEXT:    v_mov_b32_e32 v47, v42
 ; SI-NEXT:    v_mov_b32_e32 v42, v53
 ; SI-NEXT:    v_mov_b32_e32 v53, v22
-; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v34, v61
+; SI-NEXT:    v_mov_b32_e32 v35, v61
 ; SI-NEXT:    v_mov_b32_e32 v61, v56
 ; SI-NEXT:    v_mov_b32_e32 v56, v43
 ; SI-NEXT:    v_mov_b32_e32 v43, v54
 ; SI-NEXT:    v_mov_b32_e32 v54, v24
-; SI-NEXT:    v_mov_b32_e32 v50, v62
+; SI-NEXT:    v_mov_b32_e32 v50, v34
+; SI-NEXT:    v_mov_b32_e32 v34, v62
 ; SI-NEXT:    v_mov_b32_e32 v62, v57
 ; SI-NEXT:    v_mov_b32_e32 v57, v44
 ; SI-NEXT:    v_mov_b32_e32 v44, v55
 ; SI-NEXT:    v_mov_b32_e32 v55, v25
-; SI-NEXT:    v_mov_b32_e32 v32, v51
-; SI-NEXT:    v_mov_b32_e32 v51, v33
+; SI-NEXT:    v_mov_b32_e32 v32, v33
 ; SI-NEXT:    v_mov_b32_e32 v33, v63
 ; SI-NEXT:    v_mov_b32_e32 v63, v58
 ; SI-NEXT:    v_mov_b32_e32 v58, v45
 ; SI-NEXT:    v_mov_b32_e32 v45, v40
 ; SI-NEXT:    v_mov_b32_e32 v40, v31
-; SI-NEXT:    v_mov_b32_e32 v38, v26
-; SI-NEXT:    v_mov_b32_e32 v37, v27
-; SI-NEXT:    v_mov_b32_e32 v36, v28
-; SI-NEXT:    v_mov_b32_e32 v35, v49
-; SI-NEXT:    v_mov_b32_e32 v49, v29
+; SI-NEXT:    v_mov_b32_e32 v39, v26
+; SI-NEXT:    v_mov_b32_e32 v38, v27
+; SI-NEXT:    v_mov_b32_e32 v37, v28
+; SI-NEXT:    v_mov_b32_e32 v49, v36
+; SI-NEXT:    v_mov_b32_e32 v36, v29
 ; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; SI-NEXT:    v_mov_b32_e32 v29, v49
-; SI-NEXT:    v_mov_b32_e32 v49, v35
-; SI-NEXT:    v_mov_b32_e32 v28, v36
-; SI-NEXT:    v_mov_b32_e32 v27, v37
-; SI-NEXT:    v_mov_b32_e32 v26, v38
+; SI-NEXT:    v_mov_b32_e32 v29, v36
+; SI-NEXT:    v_mov_b32_e32 v36, v49
+; SI-NEXT:    v_mov_b32_e32 v28, v37
+; SI-NEXT:    v_mov_b32_e32 v27, v38
+; SI-NEXT:    v_mov_b32_e32 v26, v39
 ; SI-NEXT:    v_mov_b32_e32 v31, v40
 ; SI-NEXT:    v_mov_b32_e32 v40, v45
 ; SI-NEXT:    v_mov_b32_e32 v45, v58
 ; SI-NEXT:    v_mov_b32_e32 v58, v63
 ; SI-NEXT:    v_mov_b32_e32 v63, v33
-; SI-NEXT:    v_mov_b32_e32 v33, v51
-; SI-NEXT:    v_mov_b32_e32 v51, v32
+; SI-NEXT:    v_mov_b32_e32 v33, v32
 ; SI-NEXT:    v_mov_b32_e32 v25, v55
 ; SI-NEXT:    v_mov_b32_e32 v55, v44
 ; SI-NEXT:    v_mov_b32_e32 v44, v57
 ; SI-NEXT:    v_mov_b32_e32 v57, v62
-; SI-NEXT:    v_mov_b32_e32 v62, v50
+; SI-NEXT:    v_mov_b32_e32 v62, v34
+; SI-NEXT:    v_mov_b32_e32 v34, v50
 ; SI-NEXT:    v_mov_b32_e32 v24, v54
 ; SI-NEXT:    v_mov_b32_e32 v54, v43
 ; SI-NEXT:    v_mov_b32_e32 v43, v56
 ; SI-NEXT:    v_mov_b32_e32 v56, v61
-; SI-NEXT:    v_mov_b32_e32 v61, v34
-; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v61, v35
 ; SI-NEXT:    v_mov_b32_e32 v22, v53
 ; SI-NEXT:    v_mov_b32_e32 v53, v42
 ; SI-NEXT:    v_mov_b32_e32 v42, v47
 ; SI-NEXT:    v_mov_b32_e32 v47, v60
-; SI-NEXT:    v_mov_b32_e32 v60, v39
-; SI-NEXT:    v_mov_b32_e32 v39, v48
+; SI-NEXT:    v_mov_b32_e32 v60, v48
 ; SI-NEXT:    v_mov_b32_e32 v23, v52
 ; SI-NEXT:    v_mov_b32_e32 v52, v41
 ; SI-NEXT:    v_mov_b32_e32 v41, v46
 ; SI-NEXT:    v_mov_b32_e32 v46, v59
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; SI-NEXT:    s_branch .LBB55_2
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
index d84d3230f9538..a2bd1d30cc634 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
@@ -3868,7 +3868,7 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v3
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v4
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v5
-; GFX11-TRUE16-NEXT:    s_mov_b32 s62, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s74, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s10, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB13_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -3892,11 +3892,11 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s74, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s62
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s74
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB13_3
 ; GFX11-TRUE16-NEXT:  .LBB13_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_i32 s4, s4, 3
@@ -3943,18 +3943,16 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s74, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB13_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s62, s74
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s73
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s62
-; GFX11-TRUE16-NEXT:    s_mov_b32 s62, s72
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s63
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s62
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s73
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s72
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s63
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s62
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s61
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s60
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s18, s18, s59
@@ -3989,10 +3987,10 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v22, s6 :: v_dual_mov_b32 v23, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB13_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr74_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr73_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr72_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr63_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr62_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr61_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr60_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr59_lo16
@@ -6146,9 +6144,9 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v48, 0xffff, v3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v39, 0xffff, v4
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff, v5
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s40, s29, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s28, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s27, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s29, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s28, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s15, s27, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s26, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s14, s25, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s13, s24, 16
@@ -6164,7 +6162,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s2, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX11-FAKE16-NEXT:    s_mov_b32 s15, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s40, 0
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s4, s0, s4
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s46
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s45
@@ -6180,9 +6178,9 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s13, s24, s13
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s14, s25, s14
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s15, s27, s15
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s28, s42
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s29, s41
 ; GFX11-FAKE16-NEXT:    s_and_b32 s47, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB15_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -6199,9 +6197,9 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s40
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB15_3
 ; GFX11-FAKE16-NEXT:  .LBB15_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v37, 16, v51
@@ -6225,9 +6223,9 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT:    v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT:    v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT:    v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -8057,7 +8055,7 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v3
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v4
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v5
-; GFX11-TRUE16-NEXT:    s_mov_b32 s62, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s74, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s10, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB17_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -8081,11 +8079,11 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s74, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s62
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s74
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB17_3
 ; GFX11-TRUE16-NEXT:  .LBB17_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_i32 s4, s4, 3
@@ -8132,18 +8130,16 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s74, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB17_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s62, s74
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s73
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s62
-; GFX11-TRUE16-NEXT:    s_mov_b32 s62, s72
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s63
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s62
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s73
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s72
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s63
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s62
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s61
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s60
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s18, s18, s59
@@ -8178,10 +8174,10 @@ define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v22, s6 :: v_dual_mov_b32 v23, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB17_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr74_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr73_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr72_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr63_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr62_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr61_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr60_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr59_lo16
@@ -10607,9 +10603,9 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v48, 0xffff, v3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v39, 0xffff, v4
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff, v5
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s40, s29, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s28, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s27, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s29, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s28, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s15, s27, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s26, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s14, s25, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s13, s24, 16
@@ -10625,7 +10621,7 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s2, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX11-FAKE16-NEXT:    s_mov_b32 s15, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s40, 0
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s4, s0, s4
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s46
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s45
@@ -10641,9 +10637,9 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s13, s24, s13
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s14, s25, s14
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s15, s27, s15
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s28, s42
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s29, s41
 ; GFX11-FAKE16-NEXT:    s_and_b32 s47, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB19_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -10660,9 +10656,9 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s40
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB19_3
 ; GFX11-FAKE16-NEXT:  .LBB19_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v37, 16, v51
@@ -10686,9 +10682,9 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT:    v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT:    v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT:    v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -15936,9 +15932,9 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v48, 0xffff, v3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v39, 0xffff, v4
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff, v5
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s40, s29, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s28, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s27, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s29, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s28, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s15, s27, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s26, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s14, s25, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s13, s24, 16
@@ -15954,7 +15950,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s2, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX11-FAKE16-NEXT:    s_mov_b32 s15, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s40, 0
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s4, s0, s4
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s46
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s45
@@ -15970,9 +15966,9 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s13, s24, s13
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s14, s25, s14
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s15, s27, s15
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s28, s42
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s29, s41
 ; GFX11-FAKE16-NEXT:    s_and_b32 s47, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB31_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -15989,9 +15985,9 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s40
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB31_3
 ; GFX11-FAKE16-NEXT:  .LBB31_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v37, 16, v51
@@ -16015,9 +16011,9 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT:    v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT:    v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT:    v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -20391,9 +20387,9 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v48, 0xffff, v3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v39, 0xffff, v4
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff, v5
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s40, s29, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s28, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s27, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s29, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s28, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s15, s27, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s26, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s14, s25, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s13, s24, 16
@@ -20409,7 +20405,7 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s2, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX11-FAKE16-NEXT:    s_mov_b32 s15, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s40, 0
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s4, s0, s4
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s46
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s45
@@ -20425,9 +20421,9 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s13, s24, s13
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s14, s25, s14
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s15, s27, s15
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s28, s42
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s29, s41
 ; GFX11-FAKE16-NEXT:    s_and_b32 s47, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB35_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -20444,9 +20440,9 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s40
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB35_3
 ; GFX11-FAKE16-NEXT:  .LBB35_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v37, 16, v51
@@ -20470,9 +20466,9 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT:    v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT:    v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT:    v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -22707,7 +22703,7 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v3
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v4
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v5
-; GFX11-TRUE16-NEXT:    s_mov_b32 s62, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s74, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s10, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB41_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -22731,11 +22727,11 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s74, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s62
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s74
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB41_3
 ; GFX11-TRUE16-NEXT:  .LBB41_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_u32 s6, s6, 3
@@ -22782,18 +22778,16 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s74, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB41_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s62, s74
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s73
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s62
-; GFX11-TRUE16-NEXT:    s_mov_b32 s62, s72
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s63
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s62
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s73
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s72
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s63
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s62
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s61
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s60
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s18, s18, s59
@@ -22828,10 +22822,10 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v22, s6 :: v_dual_mov_b32 v23, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB41_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr74_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr73_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr72_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr63_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr62_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr61_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr60_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr59_lo16
@@ -24985,9 +24979,9 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v48, 0xffff, v3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v39, 0xffff, v4
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff, v5
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s40, s29, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s28, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s27, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s29, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s28, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s15, s27, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s26, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s14, s25, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s13, s24, 16
@@ -25003,7 +24997,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s2, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX11-FAKE16-NEXT:    s_mov_b32 s15, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s40, 0
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s4, s0, s4
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s46
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s45
@@ -25019,9 +25013,9 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s13, s24, s13
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s14, s25, s14
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s15, s27, s15
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s28, s42
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s29, s41
 ; GFX11-FAKE16-NEXT:    s_and_b32 s47, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB43_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -25038,9 +25032,9 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s40
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB43_3
 ; GFX11-FAKE16-NEXT:  .LBB43_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v37, 16, v51
@@ -25064,9 +25058,9 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT:    v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT:    v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT:    v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -26908,7 +26902,7 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v3
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v4
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v5
-; GFX11-TRUE16-NEXT:    s_mov_b32 s62, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s74, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s10, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB45_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -26932,11 +26926,11 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s74, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s62
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s74
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB45_3
 ; GFX11-TRUE16-NEXT:  .LBB45_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_u32 s6, s6, 3
@@ -26983,18 +26977,16 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s59, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s60, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s61, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s74, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s62, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB45_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s62, s74
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s73
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s62
-; GFX11-TRUE16-NEXT:    s_mov_b32 s62, s72
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s63
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s62
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s73
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s72
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s63
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s62
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s61
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s60
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s18, s18, s59
@@ -27029,10 +27021,10 @@ define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v22, s6 :: v_dual_mov_b32 v23, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB45_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr74_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr73_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr72_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr63_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr62_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr61_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr60_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr59_lo16
@@ -29458,9 +29450,9 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v48, 0xffff, v3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v39, 0xffff, v4
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff, v5
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s40, s29, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s28, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s27, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s29, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s28, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s15, s27, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s26, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s14, s25, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s13, s24, 16
@@ -29476,7 +29468,7 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s2, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX11-FAKE16-NEXT:    s_mov_b32 s15, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s40, 0
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s4, s0, s4
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s46
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s45
@@ -29492,9 +29484,9 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s13, s24, s13
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s14, s25, s14
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s15, s27, s15
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s28, s42
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s29, s41
 ; GFX11-FAKE16-NEXT:    s_and_b32 s47, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB47_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -29511,9 +29503,9 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s40
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB47_3
 ; GFX11-FAKE16-NEXT:  .LBB47_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v37, 16, v51
@@ -29537,9 +29529,9 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT:    v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT:    v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT:    v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -33155,9 +33147,9 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v48, 0xffff, v3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v39, 0xffff, v4
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff, v5
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s40, s29, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s28, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s27, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s29, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s28, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s15, s27, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s26, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s14, s25, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s13, s24, 16
@@ -33173,7 +33165,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s2, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX11-FAKE16-NEXT:    s_mov_b32 s15, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s40, 0
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s4, s0, s4
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s46
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s45
@@ -33189,9 +33181,9 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s13, s24, s13
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s14, s25, s14
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s15, s27, s15
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s28, s42
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s29, s41
 ; GFX11-FAKE16-NEXT:    s_and_b32 s47, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB51_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -33208,9 +33200,9 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s40
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB51_3
 ; GFX11-FAKE16-NEXT:  .LBB51_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v37, 16, v51
@@ -33234,9 +33226,9 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT:    v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT:    v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT:    v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -37528,9 +37520,9 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v48, 0xffff, v3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v39, 0xffff, v4
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xffff, v5
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s40, s29, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s28, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s27, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s29, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s28, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s15, s27, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s43, s26, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s14, s25, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s13, s24, 16
@@ -37546,7 +37538,7 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s2, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX11-FAKE16-NEXT:    s_mov_b32 s15, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s40, 0
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s4, s0, s4
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s46
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s45
@@ -37562,9 +37554,9 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s13, s24, s13
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s14, s25, s14
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s15, s27, s15
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s28, s42
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s29, s41
 ; GFX11-FAKE16-NEXT:    s_and_b32 s47, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB55_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -37581,9 +37573,9 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s16
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s40
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB55_3
 ; GFX11-FAKE16-NEXT:  .LBB55_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v37, 16, v51
@@ -37607,9 +37599,9 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT:    v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT:    v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT:    v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -39754,17 +39746,16 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v0.h
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v3.h
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v6, 16, v5
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v6, 16, v5
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v2.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v7, 16, v4
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v9, 16, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s15, s23, s15
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s14, s22, s14
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v11, s15, 3 op_sel_hi:[1,0]
@@ -39774,7 +39765,7 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v8, 16, v3
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v6, 16, v2
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v7, 16, v1
-; GFX11-TRUE16-NEXT:    v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v9, 16, v0
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s13, s15, s13
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s11, s14, s11
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
@@ -39788,7 +39779,7 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v0
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s17
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v8, s7, 3 op_sel_hi:[1,0]
@@ -39816,18 +39807,18 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v26, s1, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v25, s2, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v24, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT:    v_pk_add_u16 v22, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v50.l
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v4
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v23, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v5
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v27
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v26
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v25
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v49, 16, v24
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v23
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v18
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v8
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v7
@@ -39837,15 +39828,16 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v11
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v10
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v16
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 16, v14
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v50.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v51.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v52.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v53.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v54.l
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 16, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v14
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v50.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v51.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v52.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v53.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v54.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v55.l
 ; GFX11-TRUE16-NEXT:    s_branch .LBB57_5
 ; GFX11-TRUE16-NEXT:  .LBB57_3:
 ; GFX11-TRUE16-NEXT:    s_branch .LBB57_2
@@ -39863,15 +39855,15 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, s19
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, s18
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, s17
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, s16
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, s16
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, s3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, s2
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, s1
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, s45
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, s44
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, s43
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, s42
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, s45
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, s44
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, s43
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, s42
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, s41
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, s40
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, s15
@@ -39888,30 +39880,31 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.l, s9
 ; GFX11-TRUE16-NEXT:  .LBB57_5: ; %end
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff, v24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v29, v29, 16, v18
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v22
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v28, v28, 16, v23
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v31, 16, v12
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v33, 16, v18
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff, v15
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v24, v38, 16, v27
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v28, v28, 16, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v9
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v27, v49, 16, v51
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v50, 0xffff, v25
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v30, 16, v13
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v31, 16, v12
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v33, 16, v18
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v13, v34, 16, v22
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v13, v34, 16, v23
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff, v15
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff, v14
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v21, 16, v16
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v20, 16, v16
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v16, v21, 16, v18
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v3.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v25, v39, 16, v26
@@ -39919,9 +39912,8 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff, v6
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v23, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v16, v20, 16, v18
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v17, v19, 16, v22
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v19, 16, v17
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v17, v22, 16, v23
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v18, v30, 16, v0
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -39935,6 +39927,7 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v27
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v26, v48, 16, v50
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v35, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v36, 16, v7
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v8, v37, 16, v38
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v32, 16, v11
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v19, v0, 16, v1
@@ -39943,9 +39936,7 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v23, v23, 16, v5
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v26
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v28
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v36, 16, v7
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v29
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v4, v28 :: v_dual_mov_b32 v5, v29
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v48f16_scalar:
@@ -41950,17 +41941,16 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v0.h
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v3.h
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v6, 16, v5
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v6, 16, v5
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v2.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v7, 16, v4
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v9, 16, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s15, s23, s15
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s14, s22, s14
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v11, 0x200, s15 op_sel_hi:[0,1]
@@ -41970,7 +41960,7 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v8, 16, v3
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v6, 16, v2
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v7, 16, v1
-; GFX11-TRUE16-NEXT:    v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v9, 16, v0
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s13, s15, s13
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s11, s14, s11
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
@@ -41984,7 +41974,7 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v0
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s17
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v8, 0x200, s7 op_sel_hi:[0,1]
@@ -42012,18 +42002,18 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v26, 0x200, s1 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v25, 0x200, s2 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v24, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT:    v_pk_add_f16 v22, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v50.l
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v4
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v23, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v5
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v27
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v26
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v25
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v49, 16, v24
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v23
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v18
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v8
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v7
@@ -42033,15 +42023,16 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v11
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v10
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v16
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 16, v14
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v50.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v51.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v52.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v53.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v54.l
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 16, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v14
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v50.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v51.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v52.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v53.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v54.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v55.l
 ; GFX11-TRUE16-NEXT:    s_branch .LBB59_5
 ; GFX11-TRUE16-NEXT:  .LBB59_3:
 ; GFX11-TRUE16-NEXT:    s_branch .LBB59_2
@@ -42059,15 +42050,15 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, s19
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, s18
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, s17
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, s16
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, s16
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, s3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, s2
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, s1
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, s45
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, s44
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, s43
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, s42
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, s45
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, s44
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, s43
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, s42
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, s41
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, s40
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, s15
@@ -42084,30 +42075,31 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.l, s9
 ; GFX11-TRUE16-NEXT:  .LBB59_5: ; %end
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff, v24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v29, v29, 16, v18
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v22
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v28, v28, 16, v23
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v31, 16, v12
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v33, 16, v18
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff, v15
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v24, v38, 16, v27
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v28, v28, 16, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v9
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v27, v49, 16, v51
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v50, 0xffff, v25
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v30, 16, v13
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v31, 16, v12
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v33, 16, v18
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v13, v34, 16, v22
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v13, v34, 16, v23
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff, v15
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff, v14
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v21, 16, v16
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v20, 16, v16
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v16, v21, 16, v18
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v3.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v25, v39, 16, v26
@@ -42115,9 +42107,8 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff, v6
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v23, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v16, v20, 16, v18
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v17, v19, 16, v22
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v19, 16, v17
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v17, v22, 16, v23
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v18, v30, 16, v0
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -42131,6 +42122,7 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v27
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v26, v48, 16, v50
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v35, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v36, 16, v7
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v8, v37, 16, v38
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v32, 16, v11
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v19, v0, 16, v1
@@ -42139,9 +42131,7 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v23, v23, 16, v5
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v26
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v28
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v36, 16, v7
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v29
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v4, v28 :: v_dual_mov_b32 v5, v29
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v48i16_scalar:

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
index f4b487be4316b..fbee320c82c7f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
@@ -4145,7 +4145,7 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v5
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v6
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v7
-; GFX11-TRUE16-NEXT:    s_mov_b32 s74, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s78, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s12, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB13_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -4171,11 +4171,11 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s74
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s74, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s78
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB13_3
 ; GFX11-TRUE16-NEXT:  .LBB13_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_i32 s4, s4, 3
@@ -4226,18 +4226,16 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s74, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB13_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s74, s78
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s77
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s74
-; GFX11-TRUE16-NEXT:    s_mov_b32 s74, s76
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s75
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s74
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s77
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s76
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s75
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s74
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s73
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s72
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s18, s18, s63
@@ -4275,10 +4273,10 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v25, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB13_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr78_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr77_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr76_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr75_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr74_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr73_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr72_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr63_lo16
@@ -6640,8 +6638,8 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v50, 0xffff, v5
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v49, 0xffff, v6
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v48, 0xffff, v7
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s40, s29, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s28, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s29, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s15, s28, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s27, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s14, s26, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s13, s25, 16
@@ -6658,7 +6656,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s44, s2, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s0, 16
-; GFX11-FAKE16-NEXT:    s_mov_b32 s15, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s40, 0
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s46
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s45
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s44
@@ -6674,9 +6672,9 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s12, s24, s12
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s13, s25, s13
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s18, s27, s42
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s28, s41
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s29, s40
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s27, s42
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s15, s28, s15
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s29, s41
 ; GFX11-FAKE16-NEXT:    s_and_b32 s47, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB15_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -6695,9 +6693,9 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s40
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB15_3
 ; GFX11-FAKE16-NEXT:  .LBB15_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v39, 16, v55
@@ -6723,9 +6721,9 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT:    v_pk_add_u16 v15, s18, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT:    v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT:    v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -8758,7 +8756,7 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v5
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v6
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v7
-; GFX11-TRUE16-NEXT:    s_mov_b32 s74, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s78, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s12, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB17_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -8784,11 +8782,11 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s74
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s74, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s78
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB17_3
 ; GFX11-TRUE16-NEXT:  .LBB17_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_i32 s4, s4, 3
@@ -8839,18 +8837,16 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s74, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB17_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s74, s78
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s77
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s74
-; GFX11-TRUE16-NEXT:    s_mov_b32 s74, s76
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s75
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s74
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s77
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s76
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s75
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s74
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s73
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s72
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s18, s18, s63
@@ -8888,10 +8884,10 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v25, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB17_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr78_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr77_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr76_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr75_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr74_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr73_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr72_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr63_lo16
@@ -10468,15 +10464,15 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v28
-; SI-NEXT:    v_cvt_f16_f32_e32 v55, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v42, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v55, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v56, v7
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; SI-NEXT:    v_cvt_f16_f32_e32 v54, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v47, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v57, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v60, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v53, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v43, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v47, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v57, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v52, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
@@ -10494,13 +10490,13 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v27, v26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v26, v29
 ; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; SI-NEXT:    v_cvt_f16_f32_e32 v43, s17
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, s16
+; SI-NEXT:    v_cvt_f16_f32_e32 v53, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, s16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, s21
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, s23
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, s23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, s25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, s24
@@ -10542,9 +10538,9 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB19_4
@@ -10555,18 +10551,17 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v43
-; SI-NEXT:    s_waitcnt expcnt(4)
+; SI-NEXT:    s_waitcnt expcnt(2)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v0, v2, v0
-; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v53
+; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v12
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v26
+; SI-NEXT:    v_or_b32_e32 v0, v11, v0
 ; SI-NEXT:    v_or_b32_e32 v2, v14, v2
 ; SI-NEXT:    v_or_b32_e32 v3, v10, v3
 ; SI-NEXT:    v_or_b32_e32 v4, v9, v4
@@ -10574,12 +10569,11 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_or_b32_e32 v6, v7, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v46
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v41
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v55
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v42
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v56
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v47
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v43
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v60
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v57
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
@@ -10591,10 +10585,10 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v51, v46
 ; SI-NEXT:    v_or_b32_e32 v7, v45, v7
 ; SI-NEXT:    v_or_b32_e32 v8, v40, v8
-; SI-NEXT:    v_or_b32_e32 v9, v42, v9
+; SI-NEXT:    v_or_b32_e32 v9, v55, v9
 ; SI-NEXT:    v_or_b32_e32 v10, v54, v10
-; SI-NEXT:    v_or_b32_e32 v11, v57, v11
-; SI-NEXT:    v_or_b32_e32 v12, v53, v12
+; SI-NEXT:    v_or_b32_e32 v11, v47, v11
+; SI-NEXT:    v_or_b32_e32 v12, v60, v12
 ; SI-NEXT:    v_or_b32_e32 v13, v52, v13
 ; SI-NEXT:    v_or_b32_e32 v14, v63, v14
 ; SI-NEXT:    v_or_b32_e32 v15, v61, v15
@@ -10622,16 +10616,14 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_or_b32_e32 v25, v38, v25
 ; SI-NEXT:    s_cbranch_execnz .LBB19_3
 ; SI-NEXT:  .LBB19_2: ; %cmp.true
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v43
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v53
 ; SI-NEXT:    v_cvt_f32_f16_e32 v9, v40
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v42
-; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v55
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, v54
 ; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -10643,8 +10635,8 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v57
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v53
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v47
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v60
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, v52
 ; SI-NEXT:    v_cvt_f32_f16_e32 v16, v63
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
@@ -10744,7 +10736,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
@@ -10782,7 +10774,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; SI-NEXT:    v_or_b32_e32 v8, v9, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v55
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v42
 ; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
@@ -10792,12 +10784,12 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; SI-NEXT:    v_or_b32_e32 v10, v11, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v47
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v43
 ; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; SI-NEXT:    v_or_b32_e32 v11, v12, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v60
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v57
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
@@ -11540,8 +11532,8 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v50, 0xffff, v5
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v49, 0xffff, v6
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v48, 0xffff, v7
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s40, s29, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s28, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s29, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s15, s28, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s27, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s14, s26, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s13, s25, 16
@@ -11558,7 +11550,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s44, s2, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s0, 16
-; GFX11-FAKE16-NEXT:    s_mov_b32 s15, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s40, 0
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s46
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s45
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s44
@@ -11574,9 +11566,9 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s12, s24, s12
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s13, s25, s13
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s18, s27, s42
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s28, s41
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s29, s40
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s27, s42
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s15, s28, s15
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s29, s41
 ; GFX11-FAKE16-NEXT:    s_and_b32 s47, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB19_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -11595,9 +11587,9 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s40
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB19_3
 ; GFX11-FAKE16-NEXT:  .LBB19_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v39, 16, v55
@@ -11623,9 +11615,9 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT:    v_pk_add_f16 v15, 0x200, s18 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT:    v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT:    v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -17344,8 +17336,8 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v50, 0xffff, v5
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v49, 0xffff, v6
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v48, 0xffff, v7
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s40, s29, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s28, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s29, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s15, s28, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s27, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s14, s26, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s13, s25, 16
@@ -17362,7 +17354,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s44, s2, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s0, 16
-; GFX11-FAKE16-NEXT:    s_mov_b32 s15, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s40, 0
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s46
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s45
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s44
@@ -17378,9 +17370,9 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s12, s24, s12
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s13, s25, s13
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s18, s27, s42
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s28, s41
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s29, s40
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s27, s42
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s15, s28, s15
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s29, s41
 ; GFX11-FAKE16-NEXT:    s_and_b32 s47, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB31_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -17399,9 +17391,9 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s40
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB31_3
 ; GFX11-FAKE16-NEXT:  .LBB31_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v39, 16, v55
@@ -17427,9 +17419,9 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT:    v_pk_add_u16 v15, s18, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT:    v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT:    v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -18596,9 +18588,9 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    s_lshr_b32 s4, s10, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, s4
 ; SI-NEXT:    s_lshr_b32 s4, s11, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, s4
+; SI-NEXT:    s_waitcnt expcnt(3)
+; SI-NEXT:    v_cvt_f32_f16_e32 v60, s4
 ; SI-NEXT:    s_lshr_b32 s4, s12, 16
-; SI-NEXT:    s_waitcnt expcnt(4)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v59, s4
 ; SI-NEXT:    s_lshr_b32 s4, s13, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, s4
@@ -18642,8 +18634,8 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    v_cvt_f32_f16_e32 v32, s6
 ; SI-NEXT:    v_cvt_f32_f16_e32 v34, s7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v36, s8
-; SI-NEXT:    v_cvt_f32_f16_e32 v46, s10
-; SI-NEXT:    v_cvt_f32_f16_e32 v38, s11
+; SI-NEXT:    v_cvt_f32_f16_e32 v38, s10
+; SI-NEXT:    v_cvt_f32_f16_e32 v48, s11
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, s12
 ; SI-NEXT:    v_cvt_f32_f16_e32 v17, s13
 ; SI-NEXT:    v_cvt_f32_f16_e32 v19, s14
@@ -18658,7 +18650,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    v_cvt_f32_f16_e32 v12, s24
 ; SI-NEXT:    v_cvt_f32_f16_e32 v13, s23
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, s22
-; SI-NEXT:    v_cvt_f32_f16_e32 v50, s21
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, s21
 ; SI-NEXT:    v_cvt_f32_f16_e32 v52, s20
 ; SI-NEXT:    v_cvt_f32_f16_e32 v53, s19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v55, s18
@@ -18667,16 +18659,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    s_cbranch_execnz .LBB33_3
 ; SI-NEXT:  .LBB33_2: ; %cmp.true
 ; SI-NEXT:    v_add_f32_e64 v1, s16, 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT:    v_add_f32_e64 v6, s6, 1.0
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_add_f32_e64 v2, s17, 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e64 v3, s18, 1.0
 ; SI-NEXT:    v_add_f32_e64 v5, s19, 1.0
 ; SI-NEXT:    v_add_f32_e64 v7, s20, 1.0
@@ -18699,36 +18682,47 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    v_add_f32_e64 v12, s10, 1.0
 ; SI-NEXT:    v_add_f32_e64 v10, s8, 1.0
 ; SI-NEXT:    v_add_f32_e64 v8, s7, 1.0
+; SI-NEXT:    v_add_f32_e64 v6, s6, 1.0
 ; SI-NEXT:    v_add_f32_e64 v29, s9, 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v27
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v25
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v27
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v25
 ; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v23
 ; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v21
+; SI-NEXT:    s_waitcnt expcnt(6)
 ; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v19
+; SI-NEXT:    s_waitcnt expcnt(5)
 ; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v17
+; SI-NEXT:    s_waitcnt expcnt(4)
 ; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v15
+; SI-NEXT:    s_waitcnt expcnt(3)
 ; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v13
+; SI-NEXT:    s_waitcnt expcnt(2)
 ; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v12
+; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v10
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
 ; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v29
 ; SI-NEXT:    v_cvt_f32_f16_e32 v45, v29
 ; SI-NEXT:    v_cvt_f32_f16_e32 v32, v6
 ; SI-NEXT:    v_cvt_f32_f16_e32 v34, v8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v36, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v46, v12
-; SI-NEXT:    v_cvt_f32_f16_e32 v38, v13
+; SI-NEXT:    v_cvt_f32_f16_e32 v38, v12
+; SI-NEXT:    v_cvt_f32_f16_e32 v48, v13
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
 ; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
 ; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
@@ -18743,37 +18737,38 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    v_cvt_f32_f16_e32 v12, v16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v13, v14
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v50, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
 ; SI-NEXT:    v_cvt_f32_f16_e32 v52, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v53, v5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v55, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v41, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v43, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v30
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v63
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, v62
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v61
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v60
+; SI-NEXT:    v_cvt_f32_f16_e32 v60, v60
 ; SI-NEXT:    v_cvt_f32_f16_e32 v59, v59
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v58
 ; SI-NEXT:    v_cvt_f32_f16_e32 v16, v57
 ; SI-NEXT:    v_cvt_f32_f16_e32 v18, v56
 ; SI-NEXT:    v_cvt_f32_f16_e32 v20, v47
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v48
-; SI-NEXT:    v_cvt_f32_f16_e32 v24, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v28, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v35
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v37
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v39
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v49
-; SI-NEXT:    v_cvt_f32_f16_e32 v49, v51
-; SI-NEXT:    v_cvt_f32_f16_e32 v51, v54
-; SI-NEXT:    v_cvt_f32_f16_e32 v54, v40
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v42
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v44
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v44, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v46
+; SI-NEXT:    v_cvt_f32_f16_e32 v24, v50
+; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v33, v33
+; SI-NEXT:    v_cvt_f32_f16_e32 v35, v35
+; SI-NEXT:    v_cvt_f32_f16_e32 v37, v37
+; SI-NEXT:    v_cvt_f32_f16_e32 v39, v39
+; SI-NEXT:    v_cvt_f32_f16_e32 v49, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v51, v51
+; SI-NEXT:    v_cvt_f32_f16_e32 v54, v54
+; SI-NEXT:    v_cvt_f32_f16_e32 v40, v40
+; SI-NEXT:    v_cvt_f32_f16_e32 v42, v42
+; SI-NEXT:    v_cvt_f32_f16_e32 v44, v44
 ; SI-NEXT:  .LBB33_3: ; %end
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v44
 ; SI-NEXT:    v_cvt_f16_f32_e32 v30, v43
@@ -18810,7 +18805,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v4, v51, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v49
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, v50
+; SI-NEXT:    v_cvt_f16_f32_e32 v30, v9
 ; SI-NEXT:    v_add_i32_e32 v49, vcc, 20, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_or_b32_e32 v4, v30, v4
@@ -18914,15 +18909,15 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v4, v6, v4
 ; SI-NEXT:    buffer_store_dword v4, v11, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v38
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v60
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v48
 ; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x50, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_or_b32_e32 v4, v6, v4
 ; SI-NEXT:    buffer_store_dword v4, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v46
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v38
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x54, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_or_b32_e32 v4, v6, v4
@@ -18984,7 +18979,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    ; implicit-def: $vgpr54
 ; SI-NEXT:    ; implicit-def: $vgpr52
 ; SI-NEXT:    ; implicit-def: $vgpr51
-; SI-NEXT:    ; implicit-def: $vgpr50
+; SI-NEXT:    ; implicit-def: $vgpr9
 ; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; implicit-def: $vgpr11
 ; SI-NEXT:    ; implicit-def: $vgpr39
@@ -19014,9 +19009,9 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    ; implicit-def: $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr15
 ; SI-NEXT:    ; implicit-def: $vgpr59
+; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; implicit-def: $vgpr60
 ; SI-NEXT:    ; implicit-def: $vgpr38
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr46
 ; SI-NEXT:    ; implicit-def: $vgpr7
 ; SI-NEXT:    ; implicit-def: $vgpr36
 ; SI-NEXT:    ; implicit-def: $vgpr5
@@ -21168,15 +21163,15 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v28
-; SI-NEXT:    v_cvt_f16_f32_e32 v55, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v42, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v55, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v56, v7
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; SI-NEXT:    v_cvt_f16_f32_e32 v54, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v47, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v57, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v60, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v53, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v43, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v47, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v57, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v52, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
@@ -21194,13 +21189,13 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; SI-NEXT:    v_cvt_f16_f32_e32 v27, v26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v26, v29
 ; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; SI-NEXT:    v_cvt_f16_f32_e32 v43, s17
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, s16
+; SI-NEXT:    v_cvt_f16_f32_e32 v53, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, s16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, s21
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, s23
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, s23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, s25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, s24
@@ -21242,9 +21237,9 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB35_4
@@ -21255,18 +21250,17 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v43
-; SI-NEXT:    s_waitcnt expcnt(4)
+; SI-NEXT:    s_waitcnt expcnt(2)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v0, v2, v0
-; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v53
+; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v12
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v26
+; SI-NEXT:    v_or_b32_e32 v0, v11, v0
 ; SI-NEXT:    v_or_b32_e32 v2, v14, v2
 ; SI-NEXT:    v_or_b32_e32 v3, v10, v3
 ; SI-NEXT:    v_or_b32_e32 v4, v9, v4
@@ -21274,12 +21268,11 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v6, v7, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v46
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v41
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v55
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v42
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v56
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v47
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v43
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v60
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v57
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
@@ -21291,10 +21284,10 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v51, v46
 ; SI-NEXT:    v_or_b32_e32 v7, v45, v7
 ; SI-NEXT:    v_or_b32_e32 v8, v40, v8
-; SI-NEXT:    v_or_b32_e32 v9, v42, v9
+; SI-NEXT:    v_or_b32_e32 v9, v55, v9
 ; SI-NEXT:    v_or_b32_e32 v10, v54, v10
-; SI-NEXT:    v_or_b32_e32 v11, v57, v11
-; SI-NEXT:    v_or_b32_e32 v12, v53, v12
+; SI-NEXT:    v_or_b32_e32 v11, v47, v11
+; SI-NEXT:    v_or_b32_e32 v12, v60, v12
 ; SI-NEXT:    v_or_b32_e32 v13, v52, v13
 ; SI-NEXT:    v_or_b32_e32 v14, v63, v14
 ; SI-NEXT:    v_or_b32_e32 v15, v61, v15
@@ -21322,16 +21315,14 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v25, v38, v25
 ; SI-NEXT:    s_cbranch_execnz .LBB35_3
 ; SI-NEXT:  .LBB35_2: ; %cmp.true
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v43
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v53
 ; SI-NEXT:    v_cvt_f32_f16_e32 v9, v40
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v42
-; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v55
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, v54
 ; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -21343,8 +21334,8 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v57
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v53
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v47
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v60
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, v52
 ; SI-NEXT:    v_cvt_f32_f16_e32 v16, v63
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
@@ -21444,7 +21435,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
@@ -21482,7 +21473,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; SI-NEXT:    v_or_b32_e32 v8, v9, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v55
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v42
 ; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
@@ -21492,12 +21483,12 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; SI-NEXT:    v_or_b32_e32 v10, v11, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v47
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v43
 ; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; SI-NEXT:    v_or_b32_e32 v11, v12, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v60
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v57
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
@@ -22240,8 +22231,8 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v50, 0xffff, v5
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v49, 0xffff, v6
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v48, 0xffff, v7
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s40, s29, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s28, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s29, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s15, s28, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s27, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s14, s26, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s13, s25, 16
@@ -22258,7 +22249,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s44, s2, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s0, 16
-; GFX11-FAKE16-NEXT:    s_mov_b32 s15, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s40, 0
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s46
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s45
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s44
@@ -22274,9 +22265,9 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s12, s24, s12
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s13, s25, s13
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s18, s27, s42
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s28, s41
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s29, s40
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s27, s42
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s15, s28, s15
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s29, s41
 ; GFX11-FAKE16-NEXT:    s_and_b32 s47, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB35_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -22295,9 +22286,9 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s40
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB35_3
 ; GFX11-FAKE16-NEXT:  .LBB35_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v39, 16, v55
@@ -22323,9 +22314,9 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT:    v_pk_add_f16 v15, 0x200, s18 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT:    v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT:    v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -24751,7 +24742,7 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v5
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v6
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v7
-; GFX11-TRUE16-NEXT:    s_mov_b32 s74, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s78, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s12, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB41_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -24777,11 +24768,11 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s74
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s74, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s78
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB41_3
 ; GFX11-TRUE16-NEXT:  .LBB41_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_u32 s6, s6, 3
@@ -24832,18 +24823,16 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s74, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB41_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s74, s78
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s77
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s74
-; GFX11-TRUE16-NEXT:    s_mov_b32 s74, s76
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s75
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s74
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s77
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s76
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s75
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s74
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s73
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s72
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s18, s18, s63
@@ -24881,10 +24870,10 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v25, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB41_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr78_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr77_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr76_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr75_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr74_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr73_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr72_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr63_lo16
@@ -27246,8 +27235,8 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v50, 0xffff, v5
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v49, 0xffff, v6
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v48, 0xffff, v7
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s40, s29, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s28, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s29, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s15, s28, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s27, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s14, s26, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s13, s25, 16
@@ -27264,7 +27253,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s44, s2, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s0, 16
-; GFX11-FAKE16-NEXT:    s_mov_b32 s15, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s40, 0
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s46
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s45
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s44
@@ -27280,9 +27269,9 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s12, s24, s12
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s13, s25, s13
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s18, s27, s42
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s28, s41
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s29, s40
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s27, s42
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s15, s28, s15
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s29, s41
 ; GFX11-FAKE16-NEXT:    s_and_b32 s47, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB43_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -27301,9 +27290,9 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s40
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB43_3
 ; GFX11-FAKE16-NEXT:  .LBB43_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v39, 16, v55
@@ -27329,9 +27318,9 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT:    v_pk_add_u16 v15, s18, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT:    v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT:    v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -29379,7 +29368,7 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v5
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v6
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v7
-; GFX11-TRUE16-NEXT:    s_mov_b32 s74, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s78, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s12, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB45_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -29405,11 +29394,11 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s74
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s74, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s78
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB45_3
 ; GFX11-TRUE16-NEXT:  .LBB45_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_u32 s6, s6, 3
@@ -29460,18 +29449,16 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s63, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s72, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s73, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s74, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB45_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s74, s78
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s77
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s74
-; GFX11-TRUE16-NEXT:    s_mov_b32 s74, s76
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s75
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s74
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s77
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s76
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s75
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s74
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s73
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s72
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s18, s18, s63
@@ -29509,10 +29496,10 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v25, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB45_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr78_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr77_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr76_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr75_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr74_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr73_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr72_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr63_lo16
@@ -31089,15 +31076,15 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v28
-; SI-NEXT:    v_cvt_f16_f32_e32 v55, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v42, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v55, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v56, v7
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; SI-NEXT:    v_cvt_f16_f32_e32 v54, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v47, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v57, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v60, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v53, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v43, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v47, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v57, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v52, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
@@ -31115,13 +31102,13 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v27, v26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v26, v29
 ; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; SI-NEXT:    v_cvt_f16_f32_e32 v43, s17
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, s16
+; SI-NEXT:    v_cvt_f16_f32_e32 v53, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, s16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, s21
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, s23
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, s23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, s25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, s24
@@ -31163,9 +31150,9 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB47_4
@@ -31176,18 +31163,17 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v43
-; SI-NEXT:    s_waitcnt expcnt(4)
+; SI-NEXT:    s_waitcnt expcnt(2)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v0, v2, v0
-; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v53
+; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v12
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v26
+; SI-NEXT:    v_or_b32_e32 v0, v11, v0
 ; SI-NEXT:    v_or_b32_e32 v2, v14, v2
 ; SI-NEXT:    v_or_b32_e32 v3, v10, v3
 ; SI-NEXT:    v_or_b32_e32 v4, v9, v4
@@ -31195,12 +31181,11 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_or_b32_e32 v6, v7, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v46
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v41
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v55
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v42
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v56
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v47
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v43
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v60
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v57
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
@@ -31212,10 +31197,10 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v51, v46
 ; SI-NEXT:    v_or_b32_e32 v7, v45, v7
 ; SI-NEXT:    v_or_b32_e32 v8, v40, v8
-; SI-NEXT:    v_or_b32_e32 v9, v42, v9
+; SI-NEXT:    v_or_b32_e32 v9, v55, v9
 ; SI-NEXT:    v_or_b32_e32 v10, v54, v10
-; SI-NEXT:    v_or_b32_e32 v11, v57, v11
-; SI-NEXT:    v_or_b32_e32 v12, v53, v12
+; SI-NEXT:    v_or_b32_e32 v11, v47, v11
+; SI-NEXT:    v_or_b32_e32 v12, v60, v12
 ; SI-NEXT:    v_or_b32_e32 v13, v52, v13
 ; SI-NEXT:    v_or_b32_e32 v14, v63, v14
 ; SI-NEXT:    v_or_b32_e32 v15, v61, v15
@@ -31243,16 +31228,14 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_or_b32_e32 v25, v38, v25
 ; SI-NEXT:    s_cbranch_execnz .LBB47_3
 ; SI-NEXT:  .LBB47_2: ; %cmp.true
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v43
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v53
 ; SI-NEXT:    v_cvt_f32_f16_e32 v9, v40
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v42
-; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v55
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, v54
 ; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -31264,8 +31247,8 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v57
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v53
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v47
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v60
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, v52
 ; SI-NEXT:    v_cvt_f32_f16_e32 v16, v63
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
@@ -31365,7 +31348,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
@@ -31403,7 +31386,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; SI-NEXT:    v_or_b32_e32 v8, v9, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v55
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v42
 ; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
@@ -31413,12 +31396,12 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; SI-NEXT:    v_or_b32_e32 v10, v11, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v47
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v43
 ; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; SI-NEXT:    v_or_b32_e32 v11, v12, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v60
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v57
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
@@ -32161,8 +32144,8 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v50, 0xffff, v5
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v49, 0xffff, v6
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v48, 0xffff, v7
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s40, s29, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s28, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s29, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s15, s28, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s27, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s14, s26, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s13, s25, 16
@@ -32179,7 +32162,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s44, s2, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s0, 16
-; GFX11-FAKE16-NEXT:    s_mov_b32 s15, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s40, 0
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s46
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s45
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s44
@@ -32195,9 +32178,9 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s12, s24, s12
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s13, s25, s13
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s18, s27, s42
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s28, s41
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s29, s40
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s27, s42
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s15, s28, s15
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s29, s41
 ; GFX11-FAKE16-NEXT:    s_and_b32 s47, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB47_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -32216,9 +32199,9 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s40
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB47_3
 ; GFX11-FAKE16-NEXT:  .LBB47_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v39, 16, v55
@@ -32244,9 +32227,9 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT:    v_pk_add_f16 v15, 0x200, s18 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT:    v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT:    v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -36245,8 +36228,8 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v50, 0xffff, v5
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v49, 0xffff, v6
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v48, 0xffff, v7
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s40, s29, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s28, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s29, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s15, s28, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s27, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s14, s26, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s13, s25, 16
@@ -36263,7 +36246,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s44, s2, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s0, 16
-; GFX11-FAKE16-NEXT:    s_mov_b32 s15, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s40, 0
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s46
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s45
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s44
@@ -36279,9 +36262,9 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s12, s24, s12
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s13, s25, s13
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s18, s27, s42
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s28, s41
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s29, s40
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s27, s42
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s15, s28, s15
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s29, s41
 ; GFX11-FAKE16-NEXT:    s_and_b32 s47, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB51_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -36300,9 +36283,9 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s40
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB51_3
 ; GFX11-FAKE16-NEXT:  .LBB51_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v39, 16, v55
@@ -36328,9 +36311,9 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT:    v_pk_add_u16 v15, s18, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT:    v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT:    v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
 ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -37486,12 +37469,12 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
 ; SI-NEXT:    v_cvt_f32_f16_e32 v42, s40
 ; SI-NEXT:    s_lshr_b32 s40, s16, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v44, s40
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, s5
+; SI-NEXT:    v_cvt_f32_f16_e32 v46, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, s7
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, s6
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, s6
 ; SI-NEXT:    v_cvt_f32_f16_e32 v12, s9
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, s8
+; SI-NEXT:    v_cvt_f32_f16_e32 v45, s8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, s11
 ; SI-NEXT:    v_cvt_f32_f16_e32 v17, s10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v19, s13
@@ -37517,24 +37500,22 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
 ; SI-NEXT:    v_add_f64 v[1:2], s[16:17], 1.0
 ; SI-NEXT:    v_add_f64 v[54:55], s[18:19], 1.0
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; SI-NEXT:    v_add_f64 v[49:50], s[20:21], 1.0
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v54
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_add_f64 v[3:4], s[4:5], 1.0
+; SI-NEXT:    v_cvt_f32_f16_e32 v41, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v55
-; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v49
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v50
-; SI-NEXT:    v_cvt_f32_f16_e32 v52, v49
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v54
 ; SI-NEXT:    v_cvt_f32_f16_e32 v53, v55
 ; SI-NEXT:    v_cvt_f32_f16_e32 v55, v54
-; SI-NEXT:    v_cvt_f32_f16_e32 v49, v51
-; SI-NEXT:    v_cvt_f32_f16_e32 v51, v40
 ; SI-NEXT:    v_cvt_f32_f16_e32 v54, v42
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v44
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f64 v[49:50], s[20:21], 1.0
 ; SI-NEXT:    v_add_f64 v[37:38], s[22:23], 1.0
 ; SI-NEXT:    v_add_f64 v[33:34], s[24:25], 1.0
 ; SI-NEXT:    v_add_f64 v[31:32], s[26:27], 1.0
@@ -37544,8 +37525,8 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
 ; SI-NEXT:    v_add_f64 v[14:15], s[10:11], 1.0
 ; SI-NEXT:    v_add_f64 v[11:12], s[8:9], 1.0
 ; SI-NEXT:    v_add_f64 v[7:8], s[6:7], 1.0
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_f64 v[3:4], s[4:5], 1.0
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v49
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v50
 ; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v37
 ; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v38
 ; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v33
@@ -37564,14 +37545,13 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
 ; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v12
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v46, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v45, v11
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
 ; SI-NEXT:    v_cvt_f32_f16_e32 v17, v14
 ; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
@@ -37587,10 +37567,9 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
 ; SI-NEXT:    v_cvt_f32_f16_e32 v38, v38
 ; SI-NEXT:    v_cvt_f32_f16_e32 v48, v37
 ; SI-NEXT:    v_cvt_f32_f16_e32 v50, v50
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v52, v49
 ; SI-NEXT:    v_cvt_f32_f16_e32 v43, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v46
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v45
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v13
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v63
@@ -37609,187 +37588,191 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
 ; SI-NEXT:    v_cvt_f32_f16_e32 v35, v35
 ; SI-NEXT:    v_cvt_f32_f16_e32 v37, v47
 ; SI-NEXT:    v_cvt_f32_f16_e32 v39, v39
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v42
+; SI-NEXT:    v_cvt_f32_f16_e32 v49, v51
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v44, v44
+; SI-NEXT:    v_cvt_f32_f16_e32 v42, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v51, v40
+; SI-NEXT:    v_cvt_f32_f16_e32 v40, v44
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v44, v10
 ; SI-NEXT:  .LBB53_3: ; %end
-; SI-NEXT:    v_cvt_f16_f32_e32 v44, v44
-; SI-NEXT:    v_cvt_f16_f32_e32 v43, v43
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v44
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v43
 ; SI-NEXT:    v_cvt_f16_f32_e32 v42, v42
 ; SI-NEXT:    v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT:    v_cvt_f16_f32_e32 v40, v40
-; SI-NEXT:    v_cvt_f16_f32_e32 v55, v55
-; SI-NEXT:    v_cvt_f16_f32_e32 v54, v54
-; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v44
-; SI-NEXT:    v_cvt_f16_f32_e32 v53, v53
-; SI-NEXT:    v_cvt_f16_f32_e32 v51, v51
-; SI-NEXT:    v_or_b32_e32 v43, v43, v44
-; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v42
-; SI-NEXT:    v_cvt_f16_f32_e32 v52, v52
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, v49
-; SI-NEXT:    buffer_store_dword v43, v0, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_or_b32_e32 v10, v13, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v42
+; SI-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v43, vcc, 4, v0
-; SI-NEXT:    v_or_b32_e32 v41, v41, v42
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v40
-; SI-NEXT:    v_cvt_f16_f32_e32 v50, v50
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v39
-; SI-NEXT:    buffer_store_dword v41, v43, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v10, vcc, 4, v0
+; SI-NEXT:    v_or_b32_e32 v13, v41, v13
+; SI-NEXT:    buffer_store_dword v13, v10, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v40
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v41, vcc, 8, v0
-; SI-NEXT:    v_or_b32_e32 v55, v55, v40
-; SI-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT:    buffer_store_dword v55, v41, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v55
+; SI-NEXT:    v_add_i32_e32 v55, vcc, 8, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_or_b32_e32 v10, v13, v10
+; SI-NEXT:    buffer_store_dword v10, v55, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v55, vcc, 12, v0
-; SI-NEXT:    v_or_b32_e32 v53, v53, v54
-; SI-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, v38
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT:    buffer_store_dword v53, v55, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v54
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v53
+; SI-NEXT:    v_add_i32_e32 v53, vcc, 12, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_or_b32_e32 v10, v13, v10
+; SI-NEXT:    buffer_store_dword v10, v53, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v53, vcc, 16, v0
-; SI-NEXT:    v_or_b32_e32 v51, v52, v51
-; SI-NEXT:    v_lshlrev_b32_e32 v49, 16, v49
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v36
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v33
-; SI-NEXT:    buffer_store_dword v51, v53, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v51
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v52
+; SI-NEXT:    v_add_i32_e32 v51, vcc, 16, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_or_b32_e32 v10, v13, v10
+; SI-NEXT:    buffer_store_dword v10, v51, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v51, vcc, 20, v0
-; SI-NEXT:    v_or_b32_e32 v49, v50, v49
-; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, v34
-; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT:    buffer_store_dword v49, v51, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v49
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v50
+; SI-NEXT:    v_add_i32_e32 v49, vcc, 20, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_or_b32_e32 v10, v13, v10
+; SI-NEXT:    buffer_store_dword v10, v49, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v49, vcc, 24, v0
-; SI-NEXT:    v_or_b32_e32 v39, v48, v39
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v37
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
-; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
-; SI-NEXT:    buffer_store_dword v39, v49, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v39
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v48
+; SI-NEXT:    v_add_i32_e32 v39, vcc, 24, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_or_b32_e32 v10, v13, v10
+; SI-NEXT:    buffer_store_dword v10, v39, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v39, vcc, 28, v0
-; SI-NEXT:    v_or_b32_e32 v37, v38, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v35
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; SI-NEXT:    buffer_store_dword v37, v39, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v37
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v38
+; SI-NEXT:    v_add_i32_e32 v37, vcc, 28, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_or_b32_e32 v10, v13, v10
+; SI-NEXT:    buffer_store_dword v10, v37, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v37, vcc, 32, v0
-; SI-NEXT:    v_or_b32_e32 v35, v36, v35
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v33
-; SI-NEXT:    buffer_store_dword v35, v37, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v35
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v36
+; SI-NEXT:    v_add_i32_e32 v35, vcc, 32, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_or_b32_e32 v10, v13, v10
+; SI-NEXT:    buffer_store_dword v10, v35, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v35, vcc, 36, v0
-; SI-NEXT:    v_or_b32_e32 v33, v34, v33
-; SI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
-; SI-NEXT:    buffer_store_dword v33, v35, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v33
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v34
+; SI-NEXT:    v_add_i32_e32 v33, vcc, 36, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_or_b32_e32 v10, v13, v10
+; SI-NEXT:    buffer_store_dword v10, v33, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v33, vcc, 40, v0
-; SI-NEXT:    v_or_b32_e32 v31, v32, v31
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; SI-NEXT:    buffer_store_dword v31, v33, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v31
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v32
+; SI-NEXT:    v_add_i32_e32 v31, vcc, 40, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_or_b32_e32 v10, v13, v10
+; SI-NEXT:    buffer_store_dword v10, v31, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v31, vcc, 44, v0
-; SI-NEXT:    v_or_b32_e32 v28, v30, v28
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; SI-NEXT:    buffer_store_dword v28, v31, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v30
+; SI-NEXT:    v_add_i32_e32 v28, vcc, 44, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_or_b32_e32 v10, v13, v10
+; SI-NEXT:    buffer_store_dword v10, v28, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v28, v29
-; SI-NEXT:    v_add_i32_e32 v29, vcc, 48, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; SI-NEXT:    v_or_b32_e32 v26, v28, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
-; SI-NEXT:    buffer_store_dword v26, v29, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v26
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v29
+; SI-NEXT:    v_add_i32_e32 v26, vcc, 48, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_or_b32_e32 v10, v13, v10
+; SI-NEXT:    buffer_store_dword v10, v26, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, v27
-; SI-NEXT:    v_add_i32_e32 v27, vcc, 52, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; SI-NEXT:    v_or_b32_e32 v24, v26, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; SI-NEXT:    buffer_store_dword v24, v27, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v24
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v27
+; SI-NEXT:    v_add_i32_e32 v24, vcc, 52, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_or_b32_e32 v10, v13, v10
+; SI-NEXT:    buffer_store_dword v10, v24, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v24, v25
-; SI-NEXT:    v_add_i32_e32 v25, vcc, 56, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT:    v_or_b32_e32 v22, v24, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
-; SI-NEXT:    buffer_store_dword v22, v25, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v22
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v25
+; SI-NEXT:    v_add_i32_e32 v22, vcc, 56, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_or_b32_e32 v10, v13, v10
+; SI-NEXT:    buffer_store_dword v10, v22, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, v23
-; SI-NEXT:    v_add_i32_e32 v23, vcc, 60, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_or_b32_e32 v20, v22, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; SI-NEXT:    buffer_store_dword v20, v23, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v23
+; SI-NEXT:    v_add_i32_e32 v20, vcc, 60, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_or_b32_e32 v10, v13, v10
+; SI-NEXT:    buffer_store_dword v10, v20, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v20, v21
-; SI-NEXT:    v_add_i32_e32 v21, vcc, 64, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; SI-NEXT:    v_or_b32_e32 v18, v20, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT:    buffer_store_dword v18, v21, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v21
+; SI-NEXT:    v_add_i32_e32 v18, vcc, 64, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_or_b32_e32 v10, v13, v10
+; SI-NEXT:    buffer_store_dword v10, v18, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 0x44, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; SI-NEXT:    v_or_b32_e32 v16, v18, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    buffer_store_dword v16, v19, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v19
+; SI-NEXT:    v_add_i32_e32 v16, vcc, 0x44, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_or_b32_e32 v10, v13, v10
+; SI-NEXT:    buffer_store_dword v10, v16, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v17
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 0x48, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT:    v_or_b32_e32 v14, v16, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT:    buffer_store_dword v14, v17, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v17
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 0x48, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_or_b32_e32 v10, v13, v10
+; SI-NEXT:    buffer_store_dword v10, v14, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v15
-; SI-NEXT:    v_add_i32_e32 v15, vcc, 0x4c, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_or_b32_e32 v11, v14, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v15
+; SI-NEXT:    v_add_i32_e32 v13, vcc, 0x4c, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_or_b32_e32 v10, v11, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT:    buffer_store_dword v11, v15, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v10, v13, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v13
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 0x50, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v45
+; SI-NEXT:    v_add_i32_e32 v11, vcc, 0x50, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_or_b32_e32 v9, v11, v9
+; SI-NEXT:    v_or_b32_e32 v9, v10, v9
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT:    buffer_store_dword v9, v13, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v9, v11, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, v12
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 0x54, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT:    v_add_i32_e32 v10, vcc, 0x54, v0
 ; SI-NEXT:    v_or_b32_e32 v7, v9, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT:    buffer_store_dword v7, v11, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v10
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x58, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v5, v7, v5
-; SI-NEXT:    buffer_store_dword v5, v9, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v7, v10, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x58, v0
+; SI-NEXT:    v_or_b32_e32 v4, v4, v5
+; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x5c, v0
-; SI-NEXT:    v_or_b32_e32 v5, v6, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x5c, v0
+; SI-NEXT:    v_or_b32_e32 v4, v5, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    buffer_store_dword v5, v7, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v4, v6, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x60, v0
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x60, v0
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v46
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x64, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -37853,17 +37836,17 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
 ; SI-NEXT:    ; implicit-def: $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr15
 ; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $vgpr45
 ; SI-NEXT:    ; implicit-def: $vgpr9
 ; SI-NEXT:    ; implicit-def: $vgpr12
 ; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr10
+; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr8
 ; SI-NEXT:    ; implicit-def: $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr46
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    s_branch .LBB53_2
 ;
@@ -39981,15 +39964,15 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v28
-; SI-NEXT:    v_cvt_f16_f32_e32 v55, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v42, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v55, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v56, v7
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; SI-NEXT:    v_cvt_f16_f32_e32 v54, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v47, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v57, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v60, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v53, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v43, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v47, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v57, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v52, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
@@ -40007,13 +39990,13 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT:    v_cvt_f16_f32_e32 v27, v26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v26, v29
 ; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; SI-NEXT:    v_cvt_f16_f32_e32 v43, s17
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, s16
+; SI-NEXT:    v_cvt_f16_f32_e32 v53, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, s16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, s21
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, s23
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, s23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, s25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, s24
@@ -40055,9 +40038,9 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB55_4
@@ -40068,18 +40051,17 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v43
-; SI-NEXT:    s_waitcnt expcnt(4)
+; SI-NEXT:    s_waitcnt expcnt(2)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v0, v2, v0
-; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v53
+; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v12
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v26
+; SI-NEXT:    v_or_b32_e32 v0, v11, v0
 ; SI-NEXT:    v_or_b32_e32 v2, v14, v2
 ; SI-NEXT:    v_or_b32_e32 v3, v10, v3
 ; SI-NEXT:    v_or_b32_e32 v4, v9, v4
@@ -40087,12 +40069,11 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v6, v7, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v46
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v41
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v55
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v42
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v56
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v47
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v43
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v60
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v57
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
@@ -40104,10 +40085,10 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT:    v_mov_b32_e32 v51, v46
 ; SI-NEXT:    v_or_b32_e32 v7, v45, v7
 ; SI-NEXT:    v_or_b32_e32 v8, v40, v8
-; SI-NEXT:    v_or_b32_e32 v9, v42, v9
+; SI-NEXT:    v_or_b32_e32 v9, v55, v9
 ; SI-NEXT:    v_or_b32_e32 v10, v54, v10
-; SI-NEXT:    v_or_b32_e32 v11, v57, v11
-; SI-NEXT:    v_or_b32_e32 v12, v53, v12
+; SI-NEXT:    v_or_b32_e32 v11, v47, v11
+; SI-NEXT:    v_or_b32_e32 v12, v60, v12
 ; SI-NEXT:    v_or_b32_e32 v13, v52, v13
 ; SI-NEXT:    v_or_b32_e32 v14, v63, v14
 ; SI-NEXT:    v_or_b32_e32 v15, v61, v15
@@ -40135,16 +40116,14 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v25, v38, v25
 ; SI-NEXT:    s_cbranch_execnz .LBB55_3
 ; SI-NEXT:  .LBB55_2: ; %cmp.true
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v43
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v53
 ; SI-NEXT:    v_cvt_f32_f16_e32 v9, v40
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v42
-; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v55
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, v54
 ; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -40156,8 +40135,8 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v57
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v53
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v47
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v60
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, v52
 ; SI-NEXT:    v_cvt_f32_f16_e32 v16, v63
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
@@ -40257,7 +40236,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
@@ -40295,7 +40274,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; SI-NEXT:    v_or_b32_e32 v8, v9, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v55
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v42
 ; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
@@ -40305,12 +40284,12 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; SI-NEXT:    v_or_b32_e32 v10, v11, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v47
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v43
 ; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; SI-NEXT:    v_or_b32_e32 v11, v12, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v60
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v57
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
@@ -41053,8 +41032,8 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v50, 0xffff, v5
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v49, 0xffff, v6
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v48, 0xffff, v7
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s40, s29, 16
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s28, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s41, s29, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s15, s28, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s42, s27, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s14, s26, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s13, s25, 16
@@ -41071,7 +41050,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s44, s2, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s45, s1, 16
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s46, s0, 16
-; GFX11-FAKE16-NEXT:    s_mov_b32 s15, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s40, 0
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s46
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s45
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s44
@@ -41087,9 +41066,9 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s12, s24, s12
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s13, s25, s13
 ; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s18, s27, s42
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s28, s41
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s29, s40
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s27, s42
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s15, s28, s15
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s29, s41
 ; GFX11-FAKE16-NEXT:    s_and_b32 s47, vcc_lo, exec_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB55_4
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -41108,9 +41087,9 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
 ; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s18
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s40
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB55_3
 ; GFX11-FAKE16-NEXT:  .LBB55_2: ; %cmp.true
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v39, 16, v55
@@ -41136,9 +41115,9 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT:    v_pk_add_f16 v15, 0x200, s18 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT:    v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT:    v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
 ; GFX11-FAKE16-NEXT:    v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -42506,12 +42485,12 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:28
 ; SI-NEXT:    s_waitcnt expcnt(5)
 ; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:20
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:12
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:4
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
@@ -42543,75 +42522,70 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB57_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, s22
-; SI-NEXT:    v_mov_b32_e32 v46, v44
-; SI-NEXT:    v_cvt_f32_f16_e32 v62, v44
-; SI-NEXT:    v_mov_b32_e32 v44, v58
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, s24
 ; SI-NEXT:    v_cvt_f32_f16_e32 v58, v58
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, s16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, s17
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, s26
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, s24
 ; SI-NEXT:    v_cvt_f32_f16_e32 v34, s18
 ; SI-NEXT:    v_cvt_f32_f16_e32 v50, s19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v60, s20
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, s28
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, s26
 ; SI-NEXT:    v_cvt_f32_f16_e32 v35, s21
 ; SI-NEXT:    v_cvt_f32_f16_e32 v63, s23
 ; SI-NEXT:    v_cvt_f32_f16_e32 v61, s25
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, s29
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, s28
 ; SI-NEXT:    v_cvt_f32_f16_e32 v59, s27
 ; SI-NEXT:    s_mov_b64 s[4:5], 0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v42, v19
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, s29
 ; SI-NEXT:    v_cvt_f32_f16_e32 v41, v21
 ; SI-NEXT:    v_cvt_f32_f16_e32 v55, v23
 ; SI-NEXT:    v_cvt_f32_f16_e32 v53, v25
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v40, v26
 ; SI-NEXT:    v_cvt_f32_f16_e32 v51, v27
 ; SI-NEXT:    v_cvt_f32_f16_e32 v54, v28
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v39, v29
 ; SI-NEXT:    v_cvt_f32_f16_e32 v52, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v37
+; SI-NEXT:    v_cvt_f32_f16_e32 v37, v57
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v48, v56
+; SI-NEXT:    v_cvt_f32_f16_e32 v32, v47
+; SI-NEXT:    v_cvt_f32_f16_e32 v38, v46
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v4
-; SI-NEXT:    v_mov_b32_e32 v57, v48
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v48
-; SI-NEXT:    v_mov_b32_e32 v56, v32
+; SI-NEXT:    v_cvt_f32_f16_e32 v62, v45
+; SI-NEXT:    v_cvt_f32_f16_e32 v36, v44
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; SI-NEXT:    v_mov_b32_e32 v47, v38
-; SI-NEXT:    v_cvt_f32_f16_e32 v38, v38
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v6
-; SI-NEXT:    v_mov_b32_e32 v45, v36
-; SI-NEXT:    v_cvt_f32_f16_e32 v36, v36
-; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v7
@@ -42664,24 +42638,6 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; SI-NEXT:  .LBB57_2:
 ; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    v_mov_b32_e32 v57, v48
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    v_mov_b32_e32 v56, v32
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    v_mov_b32_e32 v47, v38
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    v_mov_b32_e32 v46, v44
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    v_mov_b32_e32 v45, v36
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    v_mov_b32_e32 v44, v58
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
 ; SI-NEXT:    s_mov_b64 s[4:5], -1
 ; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; kill: killed $vgpr31
@@ -42748,12 +42704,24 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; kill: killed $vgpr31
 ; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:  .LBB57_3: ; %Flow
 ; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v58, v62
 ; SI-NEXT:    v_mov_b32_e32 v62, v32
 ; SI-NEXT:    v_mov_b32_e32 v32, v37
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v37, v39
 ; SI-NEXT:    v_mov_b32_e32 v39, v51
 ; SI-NEXT:    v_mov_b32_e32 v51, v53
@@ -42764,12 +42732,6 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; SI-NEXT:  ; %bb.4: ; %cmp.true
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v42, vcc, 3, v43
-; SI-NEXT:    v_add_i32_e32 v43, vcc, 3, v44
-; SI-NEXT:    v_add_i32_e32 v44, vcc, 3, v45
-; SI-NEXT:    v_add_i32_e32 v45, vcc, 3, v46
-; SI-NEXT:    v_add_i32_e32 v46, vcc, 3, v47
-; SI-NEXT:    v_add_i32_e32 v47, vcc, 3, v56
-; SI-NEXT:    v_add_i32_e32 v56, vcc, 3, v57
 ; SI-NEXT:    s_add_i32 s22, s22, 3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, s22
 ; SI-NEXT:    s_add_i32 s24, s24, 3
@@ -42779,23 +42741,29 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, s24
 ; SI-NEXT:    s_add_i32 s29, s29, 3
-; SI-NEXT:    s_add_i32 s27, s27, 3
-; SI-NEXT:    s_add_i32 s25, s25, 3
+; SI-NEXT:    v_add_i32_e32 v44, vcc, 3, v44
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, s26
-; SI-NEXT:    s_add_i32 s23, s23, 3
-; SI-NEXT:    s_add_i32 s21, s21, 3
-; SI-NEXT:    s_add_i32 s20, s20, 3
+; SI-NEXT:    v_add_i32_e32 v45, vcc, 3, v45
+; SI-NEXT:    v_add_i32_e32 v46, vcc, 3, v46
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, s28
-; SI-NEXT:    s_add_i32 s19, s19, 3
-; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    s_add_i32 s17, s17, 3
+; SI-NEXT:    v_add_i32_e32 v47, vcc, 3, v47
+; SI-NEXT:    v_add_i32_e32 v56, vcc, 3, v56
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, s29
+; SI-NEXT:    v_add_i32_e32 v57, vcc, 3, v57
+; SI-NEXT:    s_add_i32 s27, s27, 3
+; SI-NEXT:    s_add_i32 s25, s25, 3
+; SI-NEXT:    s_add_i32 s23, s23, 3
+; SI-NEXT:    s_add_i32 s21, s21, 3
+; SI-NEXT:    s_add_i32 s20, s20, 3
+; SI-NEXT:    s_add_i32 s19, s19, 3
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_add_i32 s17, s17, 3
 ; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, s16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, s17
@@ -42807,6 +42775,7 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; SI-NEXT:    v_cvt_f32_f16_e32 v61, s25
 ; SI-NEXT:    v_cvt_f32_f16_e32 v59, s27
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v32, v57
 ; SI-NEXT:    v_cvt_f32_f16_e32 v48, v56
 ; SI-NEXT:    v_cvt_f32_f16_e32 v62, v47
 ; SI-NEXT:    v_cvt_f32_f16_e32 v38, v46
@@ -42815,9 +42784,8 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v42
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_add_i32_e32 v57, vcc, 3, v1
+; SI-NEXT:    v_add_i32_e32 v43, vcc, 3, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v57
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v30, vcc, 3, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
@@ -43683,16 +43651,13 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v10, 16, v5
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v2.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v8, 16, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v3.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v8, 16, v4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v3.h
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v9, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v10, 16, v1
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v11, 16, v0
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s15, s23, s15
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s14, s22, s14
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v8, 16, v3
@@ -43700,9 +43665,9 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s15, s21
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s14, s20
-; GFX11-TRUE16-NEXT:    v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT:    v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT:    v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v9, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v10, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v11, 16, v0
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s12, s15, s12
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s11, s14, s11
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0]
@@ -43716,9 +43681,9 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v0
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v1
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v2
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v18, s8, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s17
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v19, s7, 3 op_sel_hi:[1,0]
@@ -43747,14 +43712,14 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v28, s2, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v27, s3, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v26, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v52.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v53.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v54.l
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v4
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v5
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v6
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v7
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v32
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v29
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v28
@@ -43771,13 +43736,16 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v13
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v12
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v11
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v16
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v52.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v53.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v54.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v55.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v64.l
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v16
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v52.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v53.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v54.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v55.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v64.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v65.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v66.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v67.l
 ; GFX11-TRUE16-NEXT:    s_branch .LBB57_5
 ; GFX11-TRUE16-NEXT:  .LBB57_3:
 ; GFX11-TRUE16-NEXT:    s_branch .LBB57_2
@@ -43800,8 +43768,8 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, s2
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, s1
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, s45
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, s44
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, s45
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, s44
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, s43
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, s42
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, s41
@@ -43858,8 +43826,8 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff, v26
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v27, v48, 16, v27
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v28, 0xffff, v28
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v16, v22, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v17, v21, 16, v18
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v16, v21, 16, v17
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v17, v22, 16, v18
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v18, v19, 16, v0
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v19, v20, 16, v1
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v20, v23, 16, v2
@@ -45247,7 +45215,7 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v56
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v57
-; SI-NEXT:    v_cvt_f16_f32_e32 v45, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v44, s17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v43, s21
 ; SI-NEXT:    v_cvt_f16_f32_e32 v42, s25
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -45257,26 +45225,25 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_cbranch_execnz .LBB59_3
 ; SI-NEXT:  .LBB59_2: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v44, v45
+; SI-NEXT:    v_cvt_f32_f16_e32 v44, v44
 ; SI-NEXT:    v_cvt_f32_f16_e32 v43, v43
 ; SI-NEXT:    v_cvt_f32_f16_e32 v42, v42
 ; SI-NEXT:    v_cvt_f32_f16_e32 v41, v41
 ; SI-NEXT:    v_add_f32_e32 v44, 0x38000000, v44
-; SI-NEXT:    v_add_f32_e32 v43, 0x38000000, v43
 ; SI-NEXT:    v_cvt_f16_f32_e32 v44, v44
-; SI-NEXT:    v_cvt_f16_f32_e32 v45, v43
+; SI-NEXT:    v_add_f32_e32 v43, 0x38000000, v43
 ; SI-NEXT:    v_add_f32_e32 v42, 0x38000000, v42
-; SI-NEXT:    v_cvt_f16_f32_e32 v42, v42
+; SI-NEXT:    v_cvt_f16_f32_e32 v45, v43
 ; SI-NEXT:    v_lshlrev_b32_e32 v43, 16, v44
-; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v45
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, v30
-; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v42
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, v42
+; SI-NEXT:    v_cvt_f32_f16_e32 v44, v30
 ; SI-NEXT:    v_cvt_f32_f16_e32 v40, v40
 ; SI-NEXT:    v_add_f32_e32 v41, 0x38000000, v41
-; SI-NEXT:    v_add_f32_e32 v42, 0x38000000, v45
+; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v42
+; SI-NEXT:    v_add_f32_e32 v42, 0x38000000, v44
 ; SI-NEXT:    v_cvt_f16_f32_e32 v42, v42
 ; SI-NEXT:    v_add_f32_e32 v40, 0x38000000, v40
-; SI-NEXT:    v_cvt_f16_f32_e32 v45, v41
+; SI-NEXT:    v_cvt_f16_f32_e32 v44, v41
 ; SI-NEXT:    v_cvt_f16_f32_e32 v40, v40
 ; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v42
 ; SI-NEXT:    v_cvt_f32_f16_e32 v42, v55
@@ -45448,12 +45415,13 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v22
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v29
 ; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v23
-; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v45
+; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v45
+; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v44
 ; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v42
 ; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v40
 ; SI-NEXT:    v_or_b32_e32 v24, v24, v29
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v43
-; SI-NEXT:    v_or_b32_e32 v26, v26, v44
+; SI-NEXT:    v_or_b32_e32 v26, v26, v45
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v30
 ; SI-NEXT:    v_or_b32_e32 v20, v20, v41
 ; SI-NEXT:    v_or_b32_e32 v49, v49, v46
@@ -45465,8 +45433,8 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_or_b32_e32 v11, v11, v56
 ; SI-NEXT:    v_or_b32_e32 v6, v6, v28
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v57
-; SI-NEXT:    v_alignbit_b32 v45, v24, v43, 16
-; SI-NEXT:    v_alignbit_b32 v43, v25, v44, 16
+; SI-NEXT:    v_alignbit_b32 v44, v24, v43, 16
+; SI-NEXT:    v_alignbit_b32 v43, v25, v45, 16
 ; SI-NEXT:    v_alignbit_b32 v42, v19, v30, 16
 ; SI-NEXT:    v_alignbit_b32 v30, v50, v41, 16
 ; SI-NEXT:    v_alignbit_b32 v41, v48, v46, 16
@@ -45480,7 +45448,7 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
 ; SI-NEXT:    v_alignbit_b32 v28, v5, v57, 16
 ; SI-NEXT:  .LBB59_3: ; %end
 ; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
-; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v45
+; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v44
 ; SI-NEXT:    v_and_b32_e32 v24, 0xffff, v24
 ; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v44
@@ -46117,16 +46085,13 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v10, 16, v5
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v2.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v8, 16, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v3.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v8, 16, v4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v3.h
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v9, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v10, 16, v1
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v11, 16, v0
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s15, s23, s15
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s14, s22, s14
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v8, 16, v3
@@ -46134,9 +46099,9 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s15, s21
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v8, 0x200, s14 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s14, s20
-; GFX11-TRUE16-NEXT:    v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT:    v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT:    v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v9, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v10, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v11, 16, v0
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s12, s15, s12
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s11, s14, s11
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1]
@@ -46150,9 +46115,9 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v0
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v1
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v2
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v18, 0x200, s8 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s17
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v19, 0x200, s7 op_sel_hi:[0,1]
@@ -46181,14 +46146,14 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v28, 0x200, s2 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v27, 0x200, s3 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v26, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v52.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v53.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v54.l
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v4
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v5
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v6
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v7
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v32
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v29
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v28
@@ -46205,13 +46170,16 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v13
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v12
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v11
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v16
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v52.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v53.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v54.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v55.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v64.l
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v16
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v52.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v53.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v54.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v55.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v64.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v65.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v66.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v67.l
 ; GFX11-TRUE16-NEXT:    s_branch .LBB59_5
 ; GFX11-TRUE16-NEXT:  .LBB59_3:
 ; GFX11-TRUE16-NEXT:    s_branch .LBB59_2
@@ -46234,8 +46202,8 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, s2
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, s1
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, s45
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, s44
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, s45
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, s44
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, s43
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, s42
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, s41
@@ -46292,8 +46260,8 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff, v26
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v27, v48, 16, v27
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v28, 0xffff, v28
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v16, v22, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v17, v21, 16, v18
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v16, v21, 16, v17
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v17, v22, 16, v18
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v18, v19, 16, v0
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v19, v20, 16, v1
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v20, v23, 16, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
index 6c2d2b33a04b1..462e50ac8412c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
@@ -4447,7 +4447,7 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v7
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v8
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX11-TRUE16-NEXT:    s_mov_b32 s78, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s90, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s14, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB13_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -4475,11 +4475,11 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s78
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s90
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB13_3
 ; GFX11-TRUE16-NEXT:  .LBB13_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_i32 s4, s4, 3
@@ -4534,18 +4534,16 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB13_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s78, s90
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s89
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s78
-; GFX11-TRUE16-NEXT:    s_mov_b32 s78, s88
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s79
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s78
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s89
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s88
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s79
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s78
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s77
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s76
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s18, s18, s75
@@ -4586,10 +4584,10 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v26, s6 :: v_dual_mov_b32 v27, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB13_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr90_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr89_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr88_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr79_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr78_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr77_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr76_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr75_lo16
@@ -5634,8 +5632,8 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v61, 16, v46
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v47
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v56
-; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 16, v57
-; GFX9-NEXT:    v_lshrrev_b32_e32 v62, 16, v58
+; GFX9-NEXT:    v_lshrrev_b32_e32 v62, 16, v57
+; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 16, v58
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v36, 16, v59
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v28
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
@@ -5662,8 +5660,8 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 16, v38
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v37
 ; GFX9-NEXT:    v_perm_b32 v0, v36, v59, s6
-; GFX9-NEXT:    v_perm_b32 v1, v62, v58, s6
-; GFX9-NEXT:    v_perm_b32 v2, v35, v57, s6
+; GFX9-NEXT:    v_perm_b32 v1, v35, v58, s6
+; GFX9-NEXT:    v_perm_b32 v2, v62, v57, s6
 ; GFX9-NEXT:    v_perm_b32 v3, v34, v56, s6
 ; GFX9-NEXT:    v_perm_b32 v4, v33, v47, s6
 ; GFX9-NEXT:    v_perm_b32 v5, v61, v46, s6
@@ -5713,8 +5711,8 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr61
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr34
-; GFX9-NEXT:    ; implicit-def: $vgpr35
 ; GFX9-NEXT:    ; implicit-def: $vgpr62
+; GFX9-NEXT:    ; implicit-def: $vgpr35
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v23, v24, v23, s6
@@ -5804,8 +5802,8 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b32 s6, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v36, v59, s6
-; GFX9-NEXT:    v_perm_b32 v1, v62, v58, s6
-; GFX9-NEXT:    v_perm_b32 v2, v35, v57, s6
+; GFX9-NEXT:    v_perm_b32 v1, v35, v58, s6
+; GFX9-NEXT:    v_perm_b32 v2, v62, v57, s6
 ; GFX9-NEXT:    v_perm_b32 v3, v34, v56, s6
 ; GFX9-NEXT:    v_perm_b32 v4, v33, v47, s6
 ; GFX9-NEXT:    v_perm_b32 v5, v61, v46, s6
@@ -9482,7 +9480,7 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v7
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v8
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX11-TRUE16-NEXT:    s_mov_b32 s78, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s90, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s14, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB17_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -9510,11 +9508,11 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s78
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s90
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB17_3
 ; GFX11-TRUE16-NEXT:  .LBB17_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_i32 s4, s4, 3
@@ -9569,18 +9567,16 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB17_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s78, s90
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s89
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s78
-; GFX11-TRUE16-NEXT:    s_mov_b32 s78, s88
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s79
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s78
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s89
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s88
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s79
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s78
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s77
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s76
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s18, s18, s75
@@ -9621,10 +9617,10 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v26, s6 :: v_dual_mov_b32 v27, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB17_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr90_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr89_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr88_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr79_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr78_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr77_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr76_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr75_lo16
@@ -11342,15 +11338,15 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v63, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v38, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v27
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v36, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v34, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v35, v14
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v26
@@ -11443,9 +11439,9 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v63, v2
+; SI-NEXT:    v_mov_b32_e32 v49, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_mov_b32_e32 v49, v3
+; SI-NEXT:    v_mov_b32_e32 v48, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
@@ -11454,9 +11450,9 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v61, v44
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v43
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v54
-; SI-NEXT:    v_mov_b32_e32 v48, v11
+; SI-NEXT:    v_mov_b32_e32 v39, v11
 ; SI-NEXT:    v_or_b32_e32 v2, v11, v2
-; SI-NEXT:    v_mov_b32_e32 v35, v10
+; SI-NEXT:    v_mov_b32_e32 v33, v10
 ; SI-NEXT:    v_or_b32_e32 v3, v10, v3
 ; SI-NEXT:    v_or_b32_e32 v4, v9, v4
 ; SI-NEXT:    v_or_b32_e32 v5, v8, v5
@@ -11466,7 +11462,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v60
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v62
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v57
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v39
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v63
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v34
 ; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
@@ -11481,9 +11477,9 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v40, v56
 ; SI-NEXT:    v_or_b32_e32 v10, v56, v10
 ; SI-NEXT:    v_or_b32_e32 v11, v45, v11
-; SI-NEXT:    v_or_b32_e32 v12, v33, v12
-; SI-NEXT:    v_or_b32_e32 v13, v38, v13
-; SI-NEXT:    v_or_b32_e32 v14, v36, v14
+; SI-NEXT:    v_or_b32_e32 v12, v38, v12
+; SI-NEXT:    v_or_b32_e32 v13, v36, v13
+; SI-NEXT:    v_or_b32_e32 v14, v35, v14
 ; SI-NEXT:    v_or_b32_e32 v15, v32, v15
 ; SI-NEXT:    v_or_b32_e32 v17, v37, v17
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
@@ -11545,10 +11541,10 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v63
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v48
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v39
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v35
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v33
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v47
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -11558,7 +11554,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v48
 ; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
 ; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
@@ -11576,9 +11572,9 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v38
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v36
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v38
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v36
+; SI-NEXT:    v_cvt_f32_f16_e32 v15, v35
 ; SI-NEXT:    v_cvt_f32_f16_e32 v17, v32
 ; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
@@ -11691,7 +11687,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; SI-NEXT:    v_or_b32_e32 v11, v12, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v39
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v63
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
@@ -11831,10 +11827,10 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB19_4:
-; SI-NEXT:    v_mov_b32_e32 v48, v11
-; SI-NEXT:    v_mov_b32_e32 v35, v10
-; SI-NEXT:    v_mov_b32_e32 v63, v2
-; SI-NEXT:    v_mov_b32_e32 v49, v3
+; SI-NEXT:    v_mov_b32_e32 v39, v11
+; SI-NEXT:    v_mov_b32_e32 v33, v10
+; SI-NEXT:    v_mov_b32_e32 v49, v2
+; SI-NEXT:    v_mov_b32_e32 v48, v3
 ; SI-NEXT:    v_mov_b32_e32 v52, v37
 ; SI-NEXT:    v_mov_b32_e32 v37, v29
 ; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
@@ -17252,8 +17248,8 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v61, 16, v46
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v47
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v56
-; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 16, v57
-; GFX9-NEXT:    v_lshrrev_b32_e32 v62, 16, v58
+; GFX9-NEXT:    v_lshrrev_b32_e32 v62, 16, v57
+; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 16, v58
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v36, 16, v59
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v28
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
@@ -17280,8 +17276,8 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 16, v38
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v37
 ; GFX9-NEXT:    v_perm_b32 v0, v36, v59, s6
-; GFX9-NEXT:    v_perm_b32 v1, v62, v58, s6
-; GFX9-NEXT:    v_perm_b32 v2, v35, v57, s6
+; GFX9-NEXT:    v_perm_b32 v1, v35, v58, s6
+; GFX9-NEXT:    v_perm_b32 v2, v62, v57, s6
 ; GFX9-NEXT:    v_perm_b32 v3, v34, v56, s6
 ; GFX9-NEXT:    v_perm_b32 v4, v33, v47, s6
 ; GFX9-NEXT:    v_perm_b32 v5, v61, v46, s6
@@ -17331,8 +17327,8 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr61
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr34
-; GFX9-NEXT:    ; implicit-def: $vgpr35
 ; GFX9-NEXT:    ; implicit-def: $vgpr62
+; GFX9-NEXT:    ; implicit-def: $vgpr35
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v23, v24, v23, s6
@@ -17422,8 +17418,8 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b32 s6, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v36, v59, s6
-; GFX9-NEXT:    v_perm_b32 v1, v62, v58, s6
-; GFX9-NEXT:    v_perm_b32 v2, v35, v57, s6
+; GFX9-NEXT:    v_perm_b32 v1, v35, v58, s6
+; GFX9-NEXT:    v_perm_b32 v2, v62, v57, s6
 ; GFX9-NEXT:    v_perm_b32 v3, v34, v56, s6
 ; GFX9-NEXT:    v_perm_b32 v4, v33, v47, s6
 ; GFX9-NEXT:    v_perm_b32 v5, v61, v46, s6
@@ -20133,13 +20129,13 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    s_lshr_b32 s4, s9, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v40, s4
 ; SI-NEXT:    s_lshr_b32 s4, s6, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v41, s4
 ; SI-NEXT:    s_lshr_b32 s4, s7, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v43, s4
 ; SI-NEXT:    s_lshr_b32 s4, s8, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v45, s4
 ; SI-NEXT:    s_lshr_b32 s4, s10, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v47, s4
 ; SI-NEXT:    s_lshr_b32 s4, s11, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v54, s4
 ; SI-NEXT:    s_lshr_b32 s4, s12, 16
@@ -20214,98 +20210,83 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, s22
 ; SI-NEXT:    v_cvt_f32_f16_e32 v9, s21
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, s20
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, s19
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, s18
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, s17
-; SI-NEXT:    v_cvt_f32_f16_e32 v47, s16
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, s19
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s18
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s17
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s16
 ; SI-NEXT:    s_cbranch_execnz .LBB33_3
 ; SI-NEXT:  .LBB33_2: ; %cmp.true
 ; SI-NEXT:    v_add_f32_e64 v1, s16, 1.0
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_f32_e64 v2, s17, 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_f32_e64 v3, s18, 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; SI-NEXT:    v_add_f32_e64 v14, s11, 1.0
 ; SI-NEXT:    v_add_f32_e64 v36, s6, 1.0
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v36
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v54
+; SI-NEXT:    v_cvt_f32_f16_e32 v41, v54
 ; SI-NEXT:    v_cvt_f32_f16_e32 v54, v6
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e64 v9, s21, 1.0
-; SI-NEXT:    v_add_f32_e64 v15, s24, 1.0
-; SI-NEXT:    v_add_f32_e64 v22, s27, 1.0
-; SI-NEXT:    v_add_f32_e64 v29, s43, 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v22
-; SI-NEXT:    v_add_f32_e64 v26, s29, 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v29
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e64 v10, s8, 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v26
+; SI-NEXT:    v_add_f32_e64 v26, s29, 1.0
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v26
+; SI-NEXT:    s_waitcnt expcnt(6)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v58, v10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v32
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v39
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v53
-; SI-NEXT:    v_cvt_f32_f16_e32 v53, v44
-; SI-NEXT:    v_add_f32_e64 v48, s9, 1.0
+; SI-NEXT:    v_cvt_f32_f16_e32 v26, v31
 ; SI-NEXT:    v_add_f32_e64 v12, s10, 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v48
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v12
-; SI-NEXT:    v_cvt_f32_f16_e32 v59, v12
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v22
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v50
-; SI-NEXT:    v_add_f32_e64 v7, s20, 1.0
-; SI-NEXT:    v_add_f32_e64 v13, s23, 1.0
-; SI-NEXT:    v_add_f32_e64 v20, s26, 1.0
-; SI-NEXT:    v_add_f32_e64 v25, s41, 1.0
-; SI-NEXT:    v_add_f32_e64 v23, s40, 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v52, v36
-; SI-NEXT:    v_cvt_f32_f16_e32 v36, v20
-; SI-NEXT:    v_cvt_f32_f16_e32 v20, v40
-; SI-NEXT:    v_mov_b32_e32 v40, v22
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v28, v35
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v49
-; SI-NEXT:    v_cvt_f32_f16_e32 v49, v55
-; SI-NEXT:    v_cvt_f32_f16_e32 v55, v46
+; SI-NEXT:    v_add_f32_e64 v33, s7, 1.0
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_add_f32_e64 v2, s17, 1.0
+; SI-NEXT:    v_add_f32_e64 v3, s18, 1.0
 ; SI-NEXT:    v_add_f32_e64 v5, s19, 1.0
+; SI-NEXT:    v_add_f32_e64 v7, s20, 1.0
+; SI-NEXT:    v_add_f32_e64 v9, s21, 1.0
 ; SI-NEXT:    v_add_f32_e64 v11, s22, 1.0
+; SI-NEXT:    v_add_f32_e64 v13, s23, 1.0
+; SI-NEXT:    v_add_f32_e64 v15, s24, 1.0
 ; SI-NEXT:    v_add_f32_e64 v18, s25, 1.0
+; SI-NEXT:    v_add_f32_e64 v20, s26, 1.0
+; SI-NEXT:    v_add_f32_e64 v22, s27, 1.0
 ; SI-NEXT:    v_add_f32_e64 v24, s28, 1.0
+; SI-NEXT:    v_add_f32_e64 v29, s43, 1.0
 ; SI-NEXT:    v_add_f32_e64 v27, s42, 1.0
+; SI-NEXT:    v_add_f32_e64 v25, s41, 1.0
+; SI-NEXT:    v_add_f32_e64 v23, s40, 1.0
 ; SI-NEXT:    v_add_f32_e64 v21, s15, 1.0
 ; SI-NEXT:    v_add_f32_e64 v19, s14, 1.0
 ; SI-NEXT:    v_add_f32_e64 v17, s13, 1.0
 ; SI-NEXT:    v_add_f32_e64 v16, s12, 1.0
-; SI-NEXT:    v_add_f32_e64 v33, s7, 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v27
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v33
+; SI-NEXT:    v_add_f32_e64 v48, s9, 1.0
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v13
+; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v15
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v29
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v27
+; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v25
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v23
 ; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v21
 ; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v19
 ; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v17
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v33
+; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v48
+; SI-NEXT:    v_cvt_f32_f16_e32 v34, v34
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v38, v48
+; SI-NEXT:    v_cvt_f32_f16_e32 v52, v36
 ; SI-NEXT:    v_cvt_f32_f16_e32 v57, v33
+; SI-NEXT:    v_cvt_f32_f16_e32 v59, v12
 ; SI-NEXT:    v_cvt_f32_f16_e32 v60, v14
 ; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
@@ -20316,40 +20297,50 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
 ; SI-NEXT:    v_cvt_f32_f16_e32 v29, v29
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, v24
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v22
+; SI-NEXT:    v_cvt_f32_f16_e32 v36, v20
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v18
 ; SI-NEXT:    v_cvt_f32_f16_e32 v48, v15
 ; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v44, v6
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
 ; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v47, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v34
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v40, v50
 ; SI-NEXT:    v_cvt_f32_f16_e32 v50, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v63
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, v62
 ; SI-NEXT:    v_cvt_f32_f16_e32 v18, v61
-; SI-NEXT:    v_cvt_f32_f16_e32 v24, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v30, v37
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v51
-; SI-NEXT:    v_cvt_f32_f16_e32 v51, v42
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v56
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v46, v6
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v47
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v24, v45
+; SI-NEXT:    v_mov_b32_e32 v45, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v26, v43
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v30
+; SI-NEXT:    v_cvt_f32_f16_e32 v30, v32
+; SI-NEXT:    v_cvt_f32_f16_e32 v32, v35
+; SI-NEXT:    v_cvt_f32_f16_e32 v35, v37
+; SI-NEXT:    v_cvt_f32_f16_e32 v37, v39
+; SI-NEXT:    v_cvt_f32_f16_e32 v39, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v49, v51
+; SI-NEXT:    v_cvt_f32_f16_e32 v51, v53
+; SI-NEXT:    v_cvt_f32_f16_e32 v53, v55
+; SI-NEXT:    v_cvt_f32_f16_e32 v55, v42
+; SI-NEXT:    v_cvt_f32_f16_e32 v42, v44
+; SI-NEXT:    v_cvt_f32_f16_e32 v44, v46
+; SI-NEXT:    v_cvt_f32_f16_e32 v46, v56
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v56, v6
+; SI-NEXT:    v_mov_b32_e32 v47, v8
+; SI-NEXT:    v_mov_b32_e32 v43, v34
 ; SI-NEXT:  .LBB33_3: ; %end
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v56
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v47
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v31, v46
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, v45
+; SI-NEXT:    v_cvt_f16_f32_e32 v34, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; SI-NEXT:    v_or_b32_e32 v6, v8, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v31
@@ -20360,14 +20351,14 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v8, v6, s[0:3], 0 offen
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v44
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v43
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v3
 ; SI-NEXT:    v_add_i32_e32 v31, vcc, 8, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; SI-NEXT:    v_or_b32_e32 v6, v8, v6
 ; SI-NEXT:    buffer_store_dword v6, v31, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v42
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v41
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v5
 ; SI-NEXT:    v_add_i32_e32 v31, vcc, 12, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; SI-NEXT:    v_or_b32_e32 v6, v8, v6
@@ -20506,26 +20497,26 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v4, v6, v4
 ; SI-NEXT:    buffer_store_dword v4, v8, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v47
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v59
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x5c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_or_b32_e32 v4, v6, v4
 ; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v45
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v58
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x60, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_or_b32_e32 v4, v5, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v43
 ; SI-NEXT:    buffer_store_dword v4, v6, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v57
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x64, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_or_b32_e32 v3, v4, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v41
 ; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v52
@@ -20559,13 +20550,13 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB33_4:
-; SI-NEXT:    ; implicit-def: $vgpr47
+; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr56
-; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr46
-; SI-NEXT:    ; implicit-def: $vgpr43
+; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr44
-; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr42
 ; SI-NEXT:    ; implicit-def: $vgpr7
 ; SI-NEXT:    ; implicit-def: $vgpr55
@@ -20606,13 +20597,13 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    ; implicit-def: $vgpr60
 ; SI-NEXT:    ; implicit-def: $vgpr54
 ; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr47
 ; SI-NEXT:    ; implicit-def: $vgpr58
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr45
 ; SI-NEXT:    ; implicit-def: $vgpr57
-; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr43
 ; SI-NEXT:    ; implicit-def: $vgpr52
-; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr41
 ; SI-NEXT:    ; implicit-def: $vgpr38
 ; SI-NEXT:    ; implicit-def: $vgpr40
 ; SI-NEXT:    s_branch .LBB33_2
@@ -22953,15 +22944,15 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v63, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v38, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v27
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v36, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v34, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v35, v14
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v26
@@ -23054,9 +23045,9 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
 ; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v63, v2
+; SI-NEXT:    v_mov_b32_e32 v49, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_mov_b32_e32 v49, v3
+; SI-NEXT:    v_mov_b32_e32 v48, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
@@ -23065,9 +23056,9 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v61, v44
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v43
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v54
-; SI-NEXT:    v_mov_b32_e32 v48, v11
+; SI-NEXT:    v_mov_b32_e32 v39, v11
 ; SI-NEXT:    v_or_b32_e32 v2, v11, v2
-; SI-NEXT:    v_mov_b32_e32 v35, v10
+; SI-NEXT:    v_mov_b32_e32 v33, v10
 ; SI-NEXT:    v_or_b32_e32 v3, v10, v3
 ; SI-NEXT:    v_or_b32_e32 v4, v9, v4
 ; SI-NEXT:    v_or_b32_e32 v5, v8, v5
@@ -23077,7 +23068,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v60
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v62
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v57
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v39
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v63
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v34
 ; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
@@ -23092,9 +23083,9 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v40, v56
 ; SI-NEXT:    v_or_b32_e32 v10, v56, v10
 ; SI-NEXT:    v_or_b32_e32 v11, v45, v11
-; SI-NEXT:    v_or_b32_e32 v12, v33, v12
-; SI-NEXT:    v_or_b32_e32 v13, v38, v13
-; SI-NEXT:    v_or_b32_e32 v14, v36, v14
+; SI-NEXT:    v_or_b32_e32 v12, v38, v12
+; SI-NEXT:    v_or_b32_e32 v13, v36, v13
+; SI-NEXT:    v_or_b32_e32 v14, v35, v14
 ; SI-NEXT:    v_or_b32_e32 v15, v32, v15
 ; SI-NEXT:    v_or_b32_e32 v17, v37, v17
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
@@ -23156,10 +23147,10 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v63
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v48
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v39
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v35
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v33
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v47
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -23169,7 +23160,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v48
 ; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
 ; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
@@ -23187,9 +23178,9 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v38
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v36
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v38
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v36
+; SI-NEXT:    v_cvt_f32_f16_e32 v15, v35
 ; SI-NEXT:    v_cvt_f32_f16_e32 v17, v32
 ; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
@@ -23302,7 +23293,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; SI-NEXT:    v_or_b32_e32 v11, v12, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v39
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v63
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
@@ -23442,10 +23433,10 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB35_4:
-; SI-NEXT:    v_mov_b32_e32 v48, v11
-; SI-NEXT:    v_mov_b32_e32 v35, v10
-; SI-NEXT:    v_mov_b32_e32 v63, v2
-; SI-NEXT:    v_mov_b32_e32 v49, v3
+; SI-NEXT:    v_mov_b32_e32 v39, v11
+; SI-NEXT:    v_mov_b32_e32 v33, v10
+; SI-NEXT:    v_mov_b32_e32 v49, v2
+; SI-NEXT:    v_mov_b32_e32 v48, v3
 ; SI-NEXT:    v_mov_b32_e32 v52, v37
 ; SI-NEXT:    v_mov_b32_e32 v37, v29
 ; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
@@ -26829,7 +26820,7 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v7
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v8
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX11-TRUE16-NEXT:    s_mov_b32 s78, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s90, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s14, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB41_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -26857,11 +26848,11 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s78
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s90
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB41_3
 ; GFX11-TRUE16-NEXT:  .LBB41_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_u32 s6, s6, 3
@@ -26916,18 +26907,16 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB41_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s78, s90
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s89
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s78
-; GFX11-TRUE16-NEXT:    s_mov_b32 s78, s88
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s79
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s78
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s89
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s88
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s79
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s78
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s77
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s76
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s18, s18, s75
@@ -26968,10 +26957,10 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v26, s6 :: v_dual_mov_b32 v27, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB41_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr90_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr89_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr88_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr79_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr78_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr77_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr76_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr75_lo16
@@ -28016,8 +28005,8 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v61, 16, v46
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v47
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v56
-; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 16, v57
-; GFX9-NEXT:    v_lshrrev_b32_e32 v62, 16, v58
+; GFX9-NEXT:    v_lshrrev_b32_e32 v62, 16, v57
+; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 16, v58
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v36, 16, v59
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v28
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
@@ -28044,8 +28033,8 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 16, v38
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v37
 ; GFX9-NEXT:    v_perm_b32 v0, v36, v59, s6
-; GFX9-NEXT:    v_perm_b32 v1, v62, v58, s6
-; GFX9-NEXT:    v_perm_b32 v2, v35, v57, s6
+; GFX9-NEXT:    v_perm_b32 v1, v35, v58, s6
+; GFX9-NEXT:    v_perm_b32 v2, v62, v57, s6
 ; GFX9-NEXT:    v_perm_b32 v3, v34, v56, s6
 ; GFX9-NEXT:    v_perm_b32 v4, v33, v47, s6
 ; GFX9-NEXT:    v_perm_b32 v5, v61, v46, s6
@@ -28095,8 +28084,8 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr61
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr34
-; GFX9-NEXT:    ; implicit-def: $vgpr35
 ; GFX9-NEXT:    ; implicit-def: $vgpr62
+; GFX9-NEXT:    ; implicit-def: $vgpr35
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v23, v24, v23, s6
@@ -28186,8 +28175,8 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b32 s6, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v36, v59, s6
-; GFX9-NEXT:    v_perm_b32 v1, v62, v58, s6
-; GFX9-NEXT:    v_perm_b32 v2, v35, v57, s6
+; GFX9-NEXT:    v_perm_b32 v1, v35, v58, s6
+; GFX9-NEXT:    v_perm_b32 v2, v62, v57, s6
 ; GFX9-NEXT:    v_perm_b32 v3, v34, v56, s6
 ; GFX9-NEXT:    v_perm_b32 v4, v33, v47, s6
 ; GFX9-NEXT:    v_perm_b32 v5, v61, v46, s6
@@ -31878,7 +31867,7 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v7
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v8
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX11-TRUE16-NEXT:    s_mov_b32 s78, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s90, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s14, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB45_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -31906,11 +31895,11 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s78
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s90
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB45_3
 ; GFX11-TRUE16-NEXT:  .LBB45_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_u32 s6, s6, 3
@@ -31965,18 +31954,16 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s75, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s76, s17, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s77, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB45_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s78, s90
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s89
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s78
-; GFX11-TRUE16-NEXT:    s_mov_b32 s78, s88
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s79
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s78
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s89
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s88
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s79
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s78
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s77
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s76
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s18, s18, s75
@@ -32017,10 +32004,10 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v26, s6 :: v_dual_mov_b32 v27, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB45_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr90_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr89_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr88_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr79_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr78_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr77_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr76_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr75_lo16
@@ -33738,15 +33725,15 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v63, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v38, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v27
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v36, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v34, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v35, v14
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v26
@@ -33839,9 +33826,9 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v63, v2
+; SI-NEXT:    v_mov_b32_e32 v49, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_mov_b32_e32 v49, v3
+; SI-NEXT:    v_mov_b32_e32 v48, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
@@ -33850,9 +33837,9 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v61, v44
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v43
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v54
-; SI-NEXT:    v_mov_b32_e32 v48, v11
+; SI-NEXT:    v_mov_b32_e32 v39, v11
 ; SI-NEXT:    v_or_b32_e32 v2, v11, v2
-; SI-NEXT:    v_mov_b32_e32 v35, v10
+; SI-NEXT:    v_mov_b32_e32 v33, v10
 ; SI-NEXT:    v_or_b32_e32 v3, v10, v3
 ; SI-NEXT:    v_or_b32_e32 v4, v9, v4
 ; SI-NEXT:    v_or_b32_e32 v5, v8, v5
@@ -33862,7 +33849,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v60
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v62
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v57
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v39
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v63
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v34
 ; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
@@ -33877,9 +33864,9 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v40, v56
 ; SI-NEXT:    v_or_b32_e32 v10, v56, v10
 ; SI-NEXT:    v_or_b32_e32 v11, v45, v11
-; SI-NEXT:    v_or_b32_e32 v12, v33, v12
-; SI-NEXT:    v_or_b32_e32 v13, v38, v13
-; SI-NEXT:    v_or_b32_e32 v14, v36, v14
+; SI-NEXT:    v_or_b32_e32 v12, v38, v12
+; SI-NEXT:    v_or_b32_e32 v13, v36, v13
+; SI-NEXT:    v_or_b32_e32 v14, v35, v14
 ; SI-NEXT:    v_or_b32_e32 v15, v32, v15
 ; SI-NEXT:    v_or_b32_e32 v17, v37, v17
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
@@ -33941,10 +33928,10 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v63
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v48
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v39
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v35
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v33
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v47
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -33954,7 +33941,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v48
 ; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
 ; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
@@ -33972,9 +33959,9 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v38
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v36
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v38
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v36
+; SI-NEXT:    v_cvt_f32_f16_e32 v15, v35
 ; SI-NEXT:    v_cvt_f32_f16_e32 v17, v32
 ; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
@@ -34087,7 +34074,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; SI-NEXT:    v_or_b32_e32 v11, v12, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v39
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v63
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
@@ -34227,10 +34214,10 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB47_4:
-; SI-NEXT:    v_mov_b32_e32 v48, v11
-; SI-NEXT:    v_mov_b32_e32 v35, v10
-; SI-NEXT:    v_mov_b32_e32 v63, v2
-; SI-NEXT:    v_mov_b32_e32 v49, v3
+; SI-NEXT:    v_mov_b32_e32 v39, v11
+; SI-NEXT:    v_mov_b32_e32 v33, v10
+; SI-NEXT:    v_mov_b32_e32 v49, v2
+; SI-NEXT:    v_mov_b32_e32 v48, v3
 ; SI-NEXT:    v_mov_b32_e32 v52, v37
 ; SI-NEXT:    v_mov_b32_e32 v37, v29
 ; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
@@ -37825,8 +37812,8 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v61, 16, v46
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v47
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v56
-; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 16, v57
-; GFX9-NEXT:    v_lshrrev_b32_e32 v62, 16, v58
+; GFX9-NEXT:    v_lshrrev_b32_e32 v62, 16, v57
+; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 16, v58
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v36, 16, v59
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v28
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
@@ -37853,8 +37840,8 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 16, v38
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v37
 ; GFX9-NEXT:    v_perm_b32 v0, v36, v59, s6
-; GFX9-NEXT:    v_perm_b32 v1, v62, v58, s6
-; GFX9-NEXT:    v_perm_b32 v2, v35, v57, s6
+; GFX9-NEXT:    v_perm_b32 v1, v35, v58, s6
+; GFX9-NEXT:    v_perm_b32 v2, v62, v57, s6
 ; GFX9-NEXT:    v_perm_b32 v3, v34, v56, s6
 ; GFX9-NEXT:    v_perm_b32 v4, v33, v47, s6
 ; GFX9-NEXT:    v_perm_b32 v5, v61, v46, s6
@@ -37904,8 +37891,8 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr61
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr34
-; GFX9-NEXT:    ; implicit-def: $vgpr35
 ; GFX9-NEXT:    ; implicit-def: $vgpr62
+; GFX9-NEXT:    ; implicit-def: $vgpr35
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v23, v24, v23, s6
@@ -37995,8 +37982,8 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b32 s6, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v36, v59, s6
-; GFX9-NEXT:    v_perm_b32 v1, v62, v58, s6
-; GFX9-NEXT:    v_perm_b32 v2, v35, v57, s6
+; GFX9-NEXT:    v_perm_b32 v1, v35, v58, s6
+; GFX9-NEXT:    v_perm_b32 v2, v62, v57, s6
 ; GFX9-NEXT:    v_perm_b32 v3, v34, v56, s6
 ; GFX9-NEXT:    v_perm_b32 v4, v33, v47, s6
 ; GFX9-NEXT:    v_perm_b32 v5, v61, v46, s6
@@ -39543,18 +39530,18 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr56
 ; SI-NEXT:    ; implicit-def: $vgpr46
 ; SI-NEXT:    ; implicit-def: $vgpr44
-; SI-NEXT:    ; implicit-def: $vgpr47
 ; SI-NEXT:    ; implicit-def: $vgpr57
 ; SI-NEXT:    ; implicit-def: $vgpr59
 ; SI-NEXT:    ; implicit-def: $vgpr60
+; SI-NEXT:    ; implicit-def: $vgpr42
 ; SI-NEXT:    ; implicit-def: $vgpr62
 ; SI-NEXT:    ; kill: killed $vgpr29
 ; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; implicit-def: $vgpr47
 ; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr43
+; SI-NEXT:    ; implicit-def: $vgpr45
 ; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; implicit-def: $vgpr42
+; SI-NEXT:    ; implicit-def: $vgpr43
 ; SI-NEXT:    ; implicit-def: $vgpr34
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -39572,13 +39559,13 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v62, v29
 ; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v60, v29
+; SI-NEXT:    v_cvt_f32_f16_e32 v42, v29
 ; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v22
-; SI-NEXT:    v_cvt_f32_f16_e32 v59, v29
+; SI-NEXT:    v_cvt_f32_f16_e32 v60, v29
 ; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v57, v29
+; SI-NEXT:    v_cvt_f32_f16_e32 v59, v29
 ; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v20
-; SI-NEXT:    v_cvt_f32_f16_e32 v47, v29
+; SI-NEXT:    v_cvt_f32_f16_e32 v57, v29
 ; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v29, v29
 ; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
@@ -39603,8 +39590,8 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
 ; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v29, v29
 ; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v43, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v45, v27
 ; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
@@ -39645,7 +39632,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
 ; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v55, v29
 ; SI-NEXT:    v_mov_b32_e32 v29, v36
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v47, v26
 ; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -39689,35 +39676,41 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
 ; SI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB52_4
 ; SI-NEXT:  ; %bb.3: ; %cmp.true
-; SI-NEXT:    v_add_f64 v[54:55], v[1:2], 1.0
-; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    v_add_f64 v[15:16], v[15:16], 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v54
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v16
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_add_f64 v[19:20], v[19:20], 1.0
 ; SI-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v19
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v19
 ; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v53, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v45
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v47
 ; SI-NEXT:    v_add_f64 v[17:18], v[17:18], 1.0
-; SI-NEXT:    v_add_f64 v[5:6], v[5:6], 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v18
+; SI-NEXT:    v_add_f64 v[54:55], v[1:2], 1.0
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v18
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v45
+; SI-NEXT:    v_add_f64 v[23:24], v[23:24], 1.0
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v54
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v17
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v23
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v43
+; SI-NEXT:    v_add_f64 v[15:16], v[15:16], 1.0
+; SI-NEXT:    v_add_f64 v[5:6], v[5:6], 1.0
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v16
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v42
+; SI-NEXT:    v_mov_b32_e32 v42, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
 ; SI-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
 ; SI-NEXT:    v_add_f64 v[11:12], v[11:12], 1.0
 ; SI-NEXT:    v_add_f64 v[13:14], v[13:14], 1.0
 ; SI-NEXT:    v_add_f64 v[21:22], v[21:22], 1.0
-; SI-NEXT:    v_add_f64 v[23:24], v[23:24], 1.0
 ; SI-NEXT:    v_add_f64 v[25:26], v[25:26], 1.0
 ; SI-NEXT:    v_add_f64 v[27:28], v[27:28], 1.0
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v55
@@ -39733,11 +39726,9 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
 ; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v13
 ; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v21
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v22
 ; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v24
 ; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v25
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v26
@@ -39778,10 +39769,6 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v60, v60
 ; SI-NEXT:    v_cvt_f32_f16_e32 v59, v59
 ; SI-NEXT:    v_cvt_f32_f16_e32 v57, v57
-; SI-NEXT:    v_cvt_f32_f16_e32 v47, v47
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v42
 ; SI-NEXT:    v_cvt_f32_f16_e32 v44, v44
 ; SI-NEXT:    v_cvt_f32_f16_e32 v46, v46
 ; SI-NEXT:    v_cvt_f32_f16_e32 v56, v56
@@ -39796,11 +39783,11 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, v49
 ; SI-NEXT:    v_cvt_f32_f16_e32 v51, v51
 ; SI-NEXT:    v_cvt_f32_f16_e32 v54, v2
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_mov_b32_e32 v47, v26
+; SI-NEXT:    v_mov_b32_e32 v45, v27
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v55, v1
-; SI-NEXT:    v_mov_b32_e32 v45, v26
-; SI-NEXT:    v_mov_b32_e32 v43, v27
-; SI-NEXT:    v_mov_b32_e32 v42, v28
+; SI-NEXT:    v_mov_b32_e32 v43, v28
 ; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -39818,7 +39805,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; SI-NEXT:  .LBB52_4: ; %end
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v55
@@ -39985,7 +39972,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v47
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v57
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x4c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -39994,7 +39981,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v57
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v59
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x50, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -40003,7 +39990,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v59
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v60
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x54, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -40012,7 +39999,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v60
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v42
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x58, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -40039,21 +40026,21 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v31
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v45
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v47
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x64, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v33
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v43
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v45
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x68, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v34
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v42
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v43
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x6c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -40651,24 +40638,23 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; SI-NEXT:    s_cbranch_scc0 .LBB53_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s42, s5, 16
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_cvt_f32_f16_e32 v60, s42
+; SI-NEXT:    v_cvt_f32_f16_e32 v29, s42
 ; SI-NEXT:    s_lshr_b32 s42, s4, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, s42
+; SI-NEXT:    v_cvt_f32_f16_e32 v25, s42
 ; SI-NEXT:    s_lshr_b32 s42, s7, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, s42
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, s42
 ; SI-NEXT:    s_lshr_b32 s42, s6, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s42
 ; SI-NEXT:    s_lshr_b32 s42, s9, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, s42
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, s42
 ; SI-NEXT:    s_lshr_b32 s42, s8, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, s42
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s42
 ; SI-NEXT:    s_lshr_b32 s42, s11, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v58, s42
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, s42
 ; SI-NEXT:    s_lshr_b32 s42, s10, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v13, s42
 ; SI-NEXT:    s_lshr_b32 s42, s13, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v59, s42
+; SI-NEXT:    v_cvt_f32_f16_e32 v48, s42
 ; SI-NEXT:    s_lshr_b32 s42, s12, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v18, s42
 ; SI-NEXT:    s_lshr_b32 s42, s15, 16
@@ -40707,12 +40693,16 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; SI-NEXT:    v_cvt_f32_f16_e32 v46, s42
 ; SI-NEXT:    s_lshr_b32 s42, s16, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v56, s42
+; SI-NEXT:    s_waitcnt expcnt(6)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v57, s5
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, s4
-; SI-NEXT:    v_cvt_f32_f16_e32 v29, s7
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, s6
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, s9
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, s8
+; SI-NEXT:    v_cvt_f32_f16_e32 v33, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v34, s7
+; SI-NEXT:    s_waitcnt expcnt(5)
+; SI-NEXT:    v_cvt_f32_f16_e32 v58, s6
+; SI-NEXT:    s_waitcnt expcnt(4)
+; SI-NEXT:    v_cvt_f32_f16_e32 v59, s9
+; SI-NEXT:    s_waitcnt expcnt(3)
+; SI-NEXT:    v_cvt_f32_f16_e32 v60, s8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v16, s11
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, s10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v19, s13
@@ -40749,23 +40739,14 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v43
+; SI-NEXT:    v_add_f64 v[22:23], s[14:15], 1.0
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_add_f64 v[3:4], s[4:5], 1.0
-; SI-NEXT:    v_add_f64 v[22:23], s[14:15], 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v4
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v57, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v22
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v59
-; SI-NEXT:    v_add_f64 v[18:19], s[12:13], 1.0
-; SI-NEXT:    v_add_f64 v[7:8], s[6:7], 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v29, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v18
-; SI-NEXT:    v_cvt_f32_f16_e32 v18, v60
-; SI-NEXT:    v_mov_b32_e32 v60, v22
 ; SI-NEXT:    v_cvt_f32_f16_e32 v22, v5
 ; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v41, v43
@@ -40773,12 +40754,13 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; SI-NEXT:    v_add_f64 v[49:50], s[22:23], 1.0
 ; SI-NEXT:    v_add_f64 v[37:38], s[24:25], 1.0
 ; SI-NEXT:    v_add_f64 v[15:16], s[10:11], 1.0
+; SI-NEXT:    v_add_f64 v[7:8], s[6:7], 1.0
 ; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v49
 ; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v37
 ; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v38
 ; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v58, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v15
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, v37
 ; SI-NEXT:    v_cvt_f32_f16_e32 v37, v39
@@ -40792,6 +40774,7 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; SI-NEXT:    v_cvt_f32_f16_e32 v53, v46
 ; SI-NEXT:    v_add_f64 v[30:31], s[28:29], 1.0
 ; SI-NEXT:    v_add_f64 v[26:27], s[40:41], 1.0
+; SI-NEXT:    v_add_f64 v[18:19], s[12:13], 1.0
 ; SI-NEXT:    v_add_f64 v[11:12], s[8:9], 1.0
 ; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v50
 ; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v35
@@ -40801,16 +40784,20 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v26
 ; SI-NEXT:    v_lshrrev_b32_e32 v24, 16, v27
 ; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v23
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, v12
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v33, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v34, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v59, v12
+; SI-NEXT:    v_cvt_f32_f16_e32 v60, v11
 ; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v18
 ; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
 ; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, v26
@@ -40824,14 +40811,16 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; SI-NEXT:    v_cvt_f32_f16_e32 v54, v54
 ; SI-NEXT:    v_cvt_f32_f16_e32 v45, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v47, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v29, v29
+; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
 ; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v17
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v17
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v48
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v14
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v58, v58
 ; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; SI-NEXT:    v_cvt_f32_f16_e32 v59, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v48, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v18, v6
 ; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
 ; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
 ; SI-NEXT:    v_cvt_f32_f16_e32 v26, v63
@@ -40839,11 +40828,11 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; SI-NEXT:    v_cvt_f32_f16_e32 v30, v62
 ; SI-NEXT:    v_cvt_f32_f16_e32 v32, v32
 ; SI-NEXT:    v_cvt_f32_f16_e32 v35, v61
-; SI-NEXT:    v_cvt_f32_f16_e32 v49, v55
-; SI-NEXT:    v_cvt_f32_f16_e32 v55, v56
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v42, v5
 ; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v49, v55
+; SI-NEXT:    v_cvt_f32_f16_e32 v55, v56
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v44, v5
 ; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
@@ -40855,193 +40844,193 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; SI-NEXT:  .LBB53_3: ; %end
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v56
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v47
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v46
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v45
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v46
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v45
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v9
 ; SI-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, 4, v0
-; SI-NEXT:    v_or_b32_e32 v6, v17, v6
+; SI-NEXT:    v_or_b32_e32 v6, v14, v6
 ; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v44
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v43
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 8, v0
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 8, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    buffer_store_dword v5, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v5, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v42
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v41
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 12, v0
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 12, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    buffer_store_dword v5, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v5, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v55
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v40
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 16, v0
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 16, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    buffer_store_dword v5, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v5, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v53
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v54
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 20, v0
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 20, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    buffer_store_dword v5, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v5, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v51
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v52
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 24, v0
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 24, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    buffer_store_dword v5, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v5, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v49
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v50
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 28, v0
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 28, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    buffer_store_dword v5, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v5, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v39
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v15
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 32, v0
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 32, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    buffer_store_dword v5, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v5, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v37
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v38
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 36, v0
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 36, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    buffer_store_dword v5, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v5, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v35
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v36
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 40, v0
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 40, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    buffer_store_dword v5, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v5, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v32
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v3
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 44, v0
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 44, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    buffer_store_dword v5, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v5, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v30
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v12
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 48, v0
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 48, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    buffer_store_dword v5, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v5, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v28
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v31
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 52, v0
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 52, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    buffer_store_dword v5, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v5, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v11
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 56, v0
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 56, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    buffer_store_dword v5, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v5, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v24
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v27
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 60, v0
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 60, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    buffer_store_dword v5, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v5, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v4
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 64, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_add_i32_e32 v6, vcc, 64, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    buffer_store_dword v5, v14, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v4, v4, v5
+; SI-NEXT:    buffer_store_dword v4, v6, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v23
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 0x44, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    buffer_store_dword v5, v14, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v23
+; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x44, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_or_b32_e32 v4, v5, v4
+; SI-NEXT:    buffer_store_dword v4, v6, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x48, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v8
+; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x48, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_or_b32_e32 v4, v5, v4
+; SI-NEXT:    buffer_store_dword v4, v6, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v59
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v19
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x4c, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v48
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v19
+; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x4c, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_or_b32_e32 v4, v5, v4
+; SI-NEXT:    buffer_store_dword v4, v6, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x50, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    buffer_store_dword v5, v7, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v13
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v7
+; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x50, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_or_b32_e32 v4, v5, v4
+; SI-NEXT:    buffer_store_dword v4, v6, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v58
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v16
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x54, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    buffer_store_dword v5, v7, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v60
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x54, v0
+; SI-NEXT:    v_or_b32_e32 v4, v5, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    buffer_store_dword v4, v6, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v48
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x58, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x58, v0
+; SI-NEXT:    v_or_b32_e32 v2, v3, v2
+; SI-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v34
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x5c, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_or_b32_e32 v3, v5, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v17
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v59
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x5c, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v33
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x60, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v58
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x60, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v1, v3, v1
-; SI-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v1, v2, v1
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v29
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v21
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v34
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x64, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v25
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v25
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v33
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x68, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v60
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v29
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v57
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x6c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -41105,23 +41094,23 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
 ; SI-NEXT:    ; implicit-def: $vgpr8
 ; SI-NEXT:    ; implicit-def: $vgpr18
 ; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr59
+; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr7
 ; SI-NEXT:    ; implicit-def: $vgpr13
 ; SI-NEXT:    ; implicit-def: $vgpr16
+; SI-NEXT:    ; implicit-def: $vgpr10
+; SI-NEXT:    ; implicit-def: $vgpr60
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr59
+; SI-NEXT:    ; implicit-def: $vgpr17
 ; SI-NEXT:    ; implicit-def: $vgpr58
-; SI-NEXT:    ; implicit-def: $vgpr48
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr34
-; SI-NEXT:    ; implicit-def: $vgpr10
+; SI-NEXT:    ; implicit-def: $vgpr21
 ; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr21
 ; SI-NEXT:    ; implicit-def: $vgpr57
-; SI-NEXT:    ; implicit-def: $vgpr60
+; SI-NEXT:    ; implicit-def: $vgpr29
 ; SI-NEXT:    s_branch .LBB53_2
 ;
 ; VI-LABEL: bitcast_v14f64_to_v56f16_scalar:
@@ -43430,15 +43419,15 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v63, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v38, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v27
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v36, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v34, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v35, v14
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v26
@@ -43531,9 +43520,9 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
 ; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v63, v2
+; SI-NEXT:    v_mov_b32_e32 v49, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_mov_b32_e32 v49, v3
+; SI-NEXT:    v_mov_b32_e32 v48, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
@@ -43542,9 +43531,9 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
 ; SI-NEXT:    v_mov_b32_e32 v61, v44
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v43
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v54
-; SI-NEXT:    v_mov_b32_e32 v48, v11
+; SI-NEXT:    v_mov_b32_e32 v39, v11
 ; SI-NEXT:    v_or_b32_e32 v2, v11, v2
-; SI-NEXT:    v_mov_b32_e32 v35, v10
+; SI-NEXT:    v_mov_b32_e32 v33, v10
 ; SI-NEXT:    v_or_b32_e32 v3, v10, v3
 ; SI-NEXT:    v_or_b32_e32 v4, v9, v4
 ; SI-NEXT:    v_or_b32_e32 v5, v8, v5
@@ -43554,7 +43543,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v60
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v62
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v57
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v39
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v63
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v34
 ; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
@@ -43569,9 +43558,9 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
 ; SI-NEXT:    v_mov_b32_e32 v40, v56
 ; SI-NEXT:    v_or_b32_e32 v10, v56, v10
 ; SI-NEXT:    v_or_b32_e32 v11, v45, v11
-; SI-NEXT:    v_or_b32_e32 v12, v33, v12
-; SI-NEXT:    v_or_b32_e32 v13, v38, v13
-; SI-NEXT:    v_or_b32_e32 v14, v36, v14
+; SI-NEXT:    v_or_b32_e32 v12, v38, v12
+; SI-NEXT:    v_or_b32_e32 v13, v36, v13
+; SI-NEXT:    v_or_b32_e32 v14, v35, v14
 ; SI-NEXT:    v_or_b32_e32 v15, v32, v15
 ; SI-NEXT:    v_or_b32_e32 v17, v37, v17
 ; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
@@ -43633,10 +43622,10 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v63
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v48
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v39
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v35
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v33
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v47
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -43646,7 +43635,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v48
 ; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
 ; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
@@ -43664,9 +43653,9 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v38
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v36
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v38
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v36
+; SI-NEXT:    v_cvt_f32_f16_e32 v15, v35
 ; SI-NEXT:    v_cvt_f32_f16_e32 v17, v32
 ; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
@@ -43779,7 +43768,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; SI-NEXT:    v_or_b32_e32 v11, v12, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v39
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v63
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
@@ -43919,10 +43908,10 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB55_4:
-; SI-NEXT:    v_mov_b32_e32 v48, v11
-; SI-NEXT:    v_mov_b32_e32 v35, v10
-; SI-NEXT:    v_mov_b32_e32 v63, v2
-; SI-NEXT:    v_mov_b32_e32 v49, v3
+; SI-NEXT:    v_mov_b32_e32 v39, v11
+; SI-NEXT:    v_mov_b32_e32 v33, v10
+; SI-NEXT:    v_mov_b32_e32 v49, v2
+; SI-NEXT:    v_mov_b32_e32 v48, v3
 ; SI-NEXT:    v_mov_b32_e32 v52, v37
 ; SI-NEXT:    v_mov_b32_e32 v37, v29
 ; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
@@ -46189,8 +46178,9 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:48
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:12
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:4
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:4
+; SI-NEXT:    s_waitcnt expcnt(6)
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32
 ; SI-NEXT:    s_waitcnt expcnt(5)
 ; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:44
 ; SI-NEXT:    s_waitcnt expcnt(4)
@@ -46236,7 +46226,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB57_2
@@ -46262,35 +46252,33 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, s21
 ; SI-NEXT:    v_cvt_f32_f16_e32 v54, v29
 ; SI-NEXT:    v_cvt_f32_f16_e32 v41, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v52, v37
+; SI-NEXT:    v_cvt_f32_f16_e32 v52, v57
 ; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, s22
-; SI-NEXT:    v_mov_b32_e32 v57, v36
-; SI-NEXT:    v_cvt_f32_f16_e32 v55, v36
-; SI-NEXT:    v_mov_b32_e32 v56, v35
+; SI-NEXT:    v_cvt_f32_f16_e32 v55, v56
+; SI-NEXT:    v_cvt_f32_f16_e32 v50, v35
+; SI-NEXT:    v_mov_b32_e32 v47, v34
 ; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, s23
-; SI-NEXT:    v_cvt_f32_f16_e32 v50, v35
-; SI-NEXT:    v_mov_b32_e32 v47, v34
 ; SI-NEXT:    v_cvt_f32_f16_e32 v53, v34
+; SI-NEXT:    v_cvt_f32_f16_e32 v48, v32
+; SI-NEXT:    v_cvt_f32_f16_e32 v51, v31
 ; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, s24
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v32
-; SI-NEXT:    v_cvt_f32_f16_e32 v51, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v36, v63
+; SI-NEXT:    v_cvt_f32_f16_e32 v49, v62
+; SI-NEXT:    v_cvt_f32_f16_e32 v34, v61
 ; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, s25
-; SI-NEXT:    v_cvt_f32_f16_e32 v49, v62
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, v61
 ; SI-NEXT:    v_cvt_f32_f16_e32 v37, v60
+; SI-NEXT:    v_cvt_f32_f16_e32 v35, v58
 ; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, s26
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v58
 ; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, s27
@@ -46374,12 +46362,6 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; SI-NEXT:  .LBB57_2:
 ; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    ; kill: killed $vgpr33
-; SI-NEXT:    v_mov_b32_e32 v57, v36
-; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; kill: killed $vgpr33
-; SI-NEXT:    v_mov_b32_e32 v56, v35
-; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; kill: killed $vgpr33
 ; SI-NEXT:    v_mov_b32_e32 v47, v34
 ; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    ; kill: killed $vgpr33
@@ -46467,6 +46449,10 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; SI-NEXT:    ; kill: killed $vgpr33
 ; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    ; kill: killed $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; kill: killed $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; kill: killed $vgpr33
 ; SI-NEXT:  .LBB57_3: ; %Flow
 ; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -46484,8 +46470,6 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; SI-NEXT:  ; %bb.4: ; %cmp.true
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v46, vcc, 3, v47
-; SI-NEXT:    v_add_i32_e32 v47, vcc, 3, v56
-; SI-NEXT:    v_add_i32_e32 v56, vcc, 3, v57
 ; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, s16
 ; SI-NEXT:    s_add_i32 s18, s18, 3
@@ -46528,38 +46512,40 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, s25
 ; SI-NEXT:    v_add_i32_e32 v32, vcc, 3, v32
-; SI-NEXT:    s_add_i32 s19, s19, 3
+; SI-NEXT:    v_add_i32_e32 v56, vcc, 3, v56
 ; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, s26
+; SI-NEXT:    v_add_i32_e32 v57, vcc, 3, v57
+; SI-NEXT:    s_add_i32 s19, s19, 3
+; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v33, s27
 ; SI-NEXT:    s_add_i32 s17, s17, 3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v38, s17
 ; SI-NEXT:    v_cvt_f32_f16_e32 v39, s19
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, s27
+; SI-NEXT:    v_cvt_f32_f16_e32 v33, s28
+; SI-NEXT:    v_cvt_f32_f16_e32 v50, v57
 ; SI-NEXT:    v_cvt_f32_f16_e32 v55, v56
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v47
 ; SI-NEXT:    v_cvt_f32_f16_e32 v53, v46
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, s28
+; SI-NEXT:    v_cvt_f32_f16_e32 v33, s29
 ; SI-NEXT:    v_cvt_f32_f16_e32 v36, v32
 ; SI-NEXT:    v_cvt_f32_f16_e32 v51, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v34, v63
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, s29
-; SI-NEXT:    v_cvt_f32_f16_e32 v49, v62
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v60
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v58
 ; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v49, v62
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, v61
+; SI-NEXT:    v_cvt_f32_f16_e32 v37, v60
+; SI-NEXT:    v_cvt_f32_f16_e32 v35, v58
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_add_i32_e32 v57, vcc, 3, v1
+; SI-NEXT:    v_add_i32_e32 v47, vcc, 3, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v50, v57
+; SI-NEXT:    v_cvt_f32_f16_e32 v48, v47
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v30, vcc, 3, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
@@ -47494,31 +47480,26 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v10, 16, v6
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v4.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v13, 16, v5
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v2.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v13, 16, v5
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v10, 16, v4
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v3.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v11, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v10, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v12, 16, v1
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s14, s23, s14
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v13, 16, v0
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s13, s22, s13
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v11, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v12, 16, v1
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s14, s21
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s20
-; GFX11-TRUE16-NEXT:    v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT:    v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT:    v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT:    v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT:    v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v10, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v13, 16, v0
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s10, s14, s10
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s9, s13, s9
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v18, s10, 3 op_sel_hi:[1,0]
@@ -47532,11 +47513,11 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v0
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v1
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v4
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v20, s8, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s17
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v21, s7, 3 op_sel_hi:[1,0]
@@ -47565,16 +47546,16 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v30, s2, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v33, s3, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v28, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v54.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v55.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v64.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v65.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v66.l
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v5
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v6
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v7
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v8
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v9
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v32
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v31
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v30
@@ -47593,11 +47574,16 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v15
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v14
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v13
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v54.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v55.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v64.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v65.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v66.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v54.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v55.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v64.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v65.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v66.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v67.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v68.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v69.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v70.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v71.l
 ; GFX11-TRUE16-NEXT:    s_branch .LBB57_5
 ; GFX11-TRUE16-NEXT:  .LBB57_3:
 ; GFX11-TRUE16-NEXT:    s_branch .LBB57_2
@@ -50160,31 +50146,26 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v10, 16, v6
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v4.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v13, 16, v5
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v2.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v13, 16, v5
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v10, 16, v4
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v3.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v11, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v10, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v12, 16, v1
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s14, s23, s14
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v13, 16, v0
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s13, s22, s13
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v11, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v12, 16, v1
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s14, s21
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s20
-; GFX11-TRUE16-NEXT:    v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT:    v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT:    v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT:    v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT:    v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v10, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v13, 16, v0
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s10, s14, s10
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s9, s13, s9
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v18, 0x200, s10 op_sel_hi:[0,1]
@@ -50198,11 +50179,11 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v0
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v1
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v4
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v20, 0x200, s8 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s17
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v21, 0x200, s7 op_sel_hi:[0,1]
@@ -50231,16 +50212,16 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v30, 0x200, s2 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v33, 0x200, s3 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v28, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v54.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v55.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v64.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v65.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v66.l
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v5
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v6
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v7
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v8
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v9
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v32
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v31
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v30
@@ -50259,11 +50240,16 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v15
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v14
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v13
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v54.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v55.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v64.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v65.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v66.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v54.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v55.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v64.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v65.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v66.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v67.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v68.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v69.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v70.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v71.l
 ; GFX11-TRUE16-NEXT:    s_branch .LBB59_5
 ; GFX11-TRUE16-NEXT:  .LBB59_3:
 ; GFX11-TRUE16-NEXT:    s_branch .LBB59_2

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
index a5bfff7b0e5d3..94ed6276bd051 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
@@ -4738,7 +4738,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v9
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v10
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v11
-; GFX11-TRUE16-NEXT:    s_mov_b32 s89, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s94, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s40, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB13_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -4767,12 +4767,12 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s19, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s17, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s91, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s92, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s93, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s94, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s89
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s16, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s91, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s92, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s93, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s94
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB13_3
 ; GFX11-TRUE16-NEXT:  .LBB13_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_i32 s4, s4, 3
@@ -4830,22 +4830,19 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s19, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s17, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s91, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s92, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s93, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s94, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s16, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s91, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s92, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s93, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB13_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s89, s94
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s93
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s89
-; GFX11-TRUE16-NEXT:    s_mov_b32 s89, s92
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s91
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s89
-; GFX11-TRUE16-NEXT:    s_mov_b32 s89, s90
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s88
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s93
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s92
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s91
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s90
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s89
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s88
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s18, s18, s79
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s19, s19, s78
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s20, s20, s77
@@ -4887,11 +4884,11 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v28, s6 :: v_dual_mov_b32 v29, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB13_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr94_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr93_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr92_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr91_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr90_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr89_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr88_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr79_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr78_lo16
@@ -6009,8 +6006,8 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v44
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 16, v45
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v46
-; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v47
-; GFX9-NEXT:    v_lshrrev_b32_e32 v63, 16, v56
+; GFX9-NEXT:    v_lshrrev_b32_e32 v63, 16, v47
+; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v56
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 16, v57
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v36, 16, v58
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v62, 16, v59
@@ -6045,8 +6042,8 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_perm_b32 v2, v62, v59, s6
 ; GFX9-NEXT:    v_perm_b32 v3, v36, v58, s6
 ; GFX9-NEXT:    v_perm_b32 v4, v35, v57, s6
-; GFX9-NEXT:    v_perm_b32 v5, v63, v56, s6
-; GFX9-NEXT:    v_perm_b32 v6, v34, v47, s6
+; GFX9-NEXT:    v_perm_b32 v5, v34, v56, s6
+; GFX9-NEXT:    v_perm_b32 v6, v63, v47, s6
 ; GFX9-NEXT:    v_perm_b32 v7, v33, v46, s6
 ; GFX9-NEXT:    v_perm_b32 v8, v32, v45, s6
 ; GFX9-NEXT:    v_perm_b32 v9, v9, v44, s6
@@ -6088,8 +6085,8 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr39
 ; GFX9-NEXT:    ; implicit-def: $vgpr32
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; implicit-def: $vgpr34
 ; GFX9-NEXT:    ; implicit-def: $vgpr63
+; GFX9-NEXT:    ; implicit-def: $vgpr34
 ; GFX9-NEXT:    ; implicit-def: $vgpr35
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    ; implicit-def: $vgpr62
@@ -6203,8 +6200,8 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_perm_b32 v2, v62, v59, s6
 ; GFX9-NEXT:    v_perm_b32 v3, v36, v58, s6
 ; GFX9-NEXT:    v_perm_b32 v4, v35, v57, s6
-; GFX9-NEXT:    v_perm_b32 v5, v63, v56, s6
-; GFX9-NEXT:    v_perm_b32 v6, v34, v47, s6
+; GFX9-NEXT:    v_perm_b32 v5, v34, v56, s6
+; GFX9-NEXT:    v_perm_b32 v6, v63, v47, s6
 ; GFX9-NEXT:    v_perm_b32 v7, v33, v46, s6
 ; GFX9-NEXT:    v_perm_b32 v8, v32, v45, s6
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
@@ -7751,43 +7748,39 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
@@ -7810,12 +7803,10 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr47
 ; SI-NEXT:    ; implicit-def: $vgpr44
 ; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr42
 ; SI-NEXT:    ; implicit-def: $vgpr43
-; SI-NEXT:    ; implicit-def: $vgpr52
 ; SI-NEXT:    ; implicit-def: $vgpr40
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr34
+; SI-NEXT:    ; implicit-def: $vgpr54
+; SI-NEXT:    ; implicit-def: $vgpr52
 ; SI-NEXT:    ; implicit-def: $vgpr50
 ; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr38
@@ -7825,18 +7816,20 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr60
 ; SI-NEXT:    ; implicit-def: $vgpr58
 ; SI-NEXT:    ; implicit-def: $vgpr61
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; implicit-def: $vgpr63
-; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr53
-; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    ; implicit-def: $vgpr55
+; SI-NEXT:    ; implicit-def: $vgpr51
 ; SI-NEXT:    ; implicit-def: $vgpr37
-; SI-NEXT:    ; implicit-def: $vgpr54
+; SI-NEXT:    ; implicit-def: $vgpr53
 ; SI-NEXT:    ; implicit-def: $vgpr35
+; SI-NEXT:    ; implicit-def: $vgpr55
+; SI-NEXT:    ; implicit-def: $vgpr34
 ; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr42
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; SI-NEXT:    ; implicit-def: $vgpr31
@@ -7856,14 +7849,18 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; kill: killed $vgpr31
 ; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB16_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v42, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v54, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v41, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v28
 ; SI-NEXT:    v_cvt_f32_f16_e32 v55, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v27
@@ -7873,57 +7870,52 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v25
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v24
+; SI-NEXT:    v_cvt_f32_f16_e32 v39, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v23
 ; SI-NEXT:    s_waitcnt expcnt(2)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v61, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v22
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v12
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, v32
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v11
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v21
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v36, v32
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v38, v32
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v20
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v9
 ; SI-NEXT:    v_cvt_f32_f16_e32 v48, v32
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v8
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v50, v32
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v5
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v52, v32
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v18
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, v32
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v35
+; SI-NEXT:    v_cvt_f32_f16_e32 v54, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v5
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v17
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
+; SI-NEXT:    v_cvt_f32_f16_e32 v40, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v17
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v58, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v15
@@ -7931,6 +7923,9 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v14
 ; SI-NEXT:    v_cvt_f32_f16_e32 v62, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v13
+; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
 ; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
 ; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
@@ -7946,42 +7941,43 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, v35
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, v35
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v47, v35
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v43, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v45, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v47, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; SI-NEXT:    v_cvt_f32_f16_e32 v57, v35
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v59, v27
-; SI-NEXT:    v_cvt_f32_f16_e32 v63, v26
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f32_f16_e32 v52, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v57, v32
+; SI-NEXT:    v_cvt_f32_f16_e32 v32, v30
+; SI-NEXT:    v_cvt_f32_f16_e32 v34, v29
+; SI-NEXT:    v_cvt_f32_f16_e32 v35, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v37, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v59, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v63, v25
+; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; SI-NEXT:    v_cvt_f32_f16_e32 v44, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v46, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v56, v1
@@ -8019,61 +8015,37 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB16_4
 ; SI-NEXT:  ; %bb.3: ; %cmp.true
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v1
-; SI-NEXT:    v_add_i32_e32 v23, vcc, 3, v23
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v1
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v23
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v46, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v37
 ; SI-NEXT:    v_add_i32_e32 v22, vcc, 3, v22
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v22
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v46, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v35
 ; SI-NEXT:    v_add_i32_e32 v21, vcc, 3, v21
 ; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v21
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v34
 ; SI-NEXT:    v_add_i32_e32 v20, vcc, 3, v20
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v20
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v32
 ; SI-NEXT:    v_add_i32_e32 v19, vcc, 3, v19
-; SI-NEXT:    v_add_i32_e32 v24, vcc, 3, v24
+; SI-NEXT:    v_add_i32_e32 v23, vcc, 3, v23
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v56, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v39
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v37
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v63
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 3, v17
-; SI-NEXT:    v_add_i32_e32 v18, vcc, 3, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v18
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v61
-; SI-NEXT:    v_mov_b32_e32 v61, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v59
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v4
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 3, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v43
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, v47
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 3, v3
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v4
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v5
+; SI-NEXT:    v_add_i32_e32 v6, vcc, 3, v6
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 3, v7
 ; SI-NEXT:    v_add_i32_e32 v8, vcc, 3, v8
 ; SI-NEXT:    v_add_i32_e32 v9, vcc, 3, v9
@@ -8084,15 +8056,20 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_add_i32_e32 v14, vcc, 3, v14
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 3, v15
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 3, v16
+; SI-NEXT:    v_add_i32_e32 v17, vcc, 3, v17
+; SI-NEXT:    v_add_i32_e32 v18, vcc, 3, v18
+; SI-NEXT:    v_add_i32_e32 v24, vcc, 3, v24
 ; SI-NEXT:    v_add_i32_e32 v25, vcc, 3, v25
 ; SI-NEXT:    v_add_i32_e32 v26, vcc, 3, v26
 ; SI-NEXT:    v_add_i32_e32 v27, vcc, 3, v27
 ; SI-NEXT:    v_add_i32_e32 v28, vcc, 3, v28
 ; SI-NEXT:    v_add_i32_e32 v29, vcc, 3, v29
 ; SI-NEXT:    v_add_i32_e32 v30, vcc, 3, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v7
 ; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v8
 ; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v9
 ; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v10
@@ -8102,12 +8079,15 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v15
 ; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v24
 ; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v25
 ; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v26
 ; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v27
 ; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v29
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v30
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v29
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v30
 ; SI-NEXT:    v_cvt_f32_f16_e32 v30, v30
 ; SI-NEXT:    v_cvt_f32_f16_e32 v29, v29
 ; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
@@ -8127,24 +8107,27 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
 ; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
 ; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v47, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v52, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v44, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v42, v42
 ; SI-NEXT:    v_cvt_f32_f16_e32 v41, v41
-; SI-NEXT:    v_cvt_f32_f16_e32 v54, v54
 ; SI-NEXT:    v_cvt_f32_f16_e32 v55, v55
 ; SI-NEXT:    v_cvt_f32_f16_e32 v53, v53
 ; SI-NEXT:    v_cvt_f32_f16_e32 v51, v51
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v39, v39
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v61
+; SI-NEXT:    v_mov_b32_e32 v61, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v59
 ; SI-NEXT:    v_cvt_f32_f16_e32 v58, v58
 ; SI-NEXT:    v_cvt_f32_f16_e32 v60, v60
 ; SI-NEXT:    v_cvt_f32_f16_e32 v62, v62
@@ -8154,41 +8137,46 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v38, v38
 ; SI-NEXT:    v_cvt_f32_f16_e32 v48, v48
 ; SI-NEXT:    v_cvt_f32_f16_e32 v50, v50
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, v40
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v45
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, v57
-; SI-NEXT:    v_mov_b32_e32 v39, v28
-; SI-NEXT:    v_mov_b32_e32 v37, v29
-; SI-NEXT:    v_mov_b32_e32 v35, v30
-; SI-NEXT:    v_mov_b32_e32 v63, v26
-; SI-NEXT:    v_mov_b32_e32 v59, v27
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_cvt_f32_f16_e32 v57, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v52, v52
+; SI-NEXT:    v_cvt_f32_f16_e32 v54, v54
+; SI-NEXT:    v_cvt_f32_f16_e32 v40, v40
+; SI-NEXT:    v_cvt_f32_f16_e32 v43, v43
+; SI-NEXT:    v_cvt_f32_f16_e32 v45, v45
+; SI-NEXT:    v_cvt_f32_f16_e32 v47, v47
+; SI-NEXT:    v_cvt_f32_f16_e32 v57, v57
+; SI-NEXT:    v_mov_b32_e32 v37, v27
+; SI-NEXT:    v_mov_b32_e32 v35, v28
+; SI-NEXT:    v_mov_b32_e32 v34, v29
+; SI-NEXT:    v_mov_b32_e32 v32, v30
+; SI-NEXT:    v_mov_b32_e32 v63, v25
+; SI-NEXT:    v_mov_b32_e32 v59, v26
+; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; SI-NEXT:  .LBB16_4: ; %end
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v57
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v57
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v56
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v46
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -8207,39 +8195,43 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v43
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v42
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v40
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v52
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v32
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v54
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v34
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v52
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 24, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v50
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
@@ -8248,7 +8240,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v48
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
@@ -8257,7 +8249,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v38
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 36, v0
@@ -8266,7 +8258,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v36
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 40, v0
@@ -8275,7 +8267,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v33
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 44, v0
@@ -8284,7 +8276,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v31
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
@@ -8293,7 +8285,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v62
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
@@ -8302,7 +8294,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v60
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 56, v0
@@ -8311,7 +8303,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v58
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
@@ -8321,8 +8313,8 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 64, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -8332,8 +8324,8 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x44, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -8343,8 +8335,8 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x48, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -8354,8 +8346,8 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x4c, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -8365,8 +8357,8 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x50, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -8376,8 +8368,8 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x54, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -8386,66 +8378,62 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v61
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x58, v0
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v61
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v39
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x5c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v49
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v63
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x60, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v51
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v63
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v59
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x64, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v53
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v59
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v37
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x68, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v55
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v39
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v35
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x6c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v54
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v37
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v41
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v34
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x70, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v41
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v35
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v42
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v32
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x74, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -10189,7 +10177,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v9
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v10
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v11
-; GFX11-TRUE16-NEXT:    s_mov_b32 s89, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s94, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s40, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB17_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -10218,12 +10206,12 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s19, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s17, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s91, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s92, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s93, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s94, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s89
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s16, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s91, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s92, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s93, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s94
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB17_3
 ; GFX11-TRUE16-NEXT:  .LBB17_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_i32 s4, s4, 3
@@ -10281,22 +10269,19 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s19, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s17, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s91, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s92, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s93, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s94, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s16, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s91, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s92, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s93, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB17_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s89, s94
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s93
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s89
-; GFX11-TRUE16-NEXT:    s_mov_b32 s89, s92
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s91
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s89
-; GFX11-TRUE16-NEXT:    s_mov_b32 s89, s90
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s88
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s93
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s92
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s91
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s90
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s89
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s88
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s18, s18, s79
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s19, s19, s78
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s20, s20, s77
@@ -10338,11 +10323,11 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v28, s6 :: v_dual_mov_b32 v29, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB17_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr94_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr93_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr92_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr91_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr90_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr89_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr88_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr79_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr78_lo16
@@ -12234,9 +12219,9 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, s19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, s20
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, s20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, s23
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, s22
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, s25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, s24
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, s27
@@ -12315,9 +12300,9 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(5)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v10, v2
+; SI-NEXT:    s_waitcnt expcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v3, v10, v3
 ; SI-NEXT:    s_waitcnt expcnt(3)
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v34
 ; SI-NEXT:    v_mov_b32_e32 v33, v32
@@ -12345,13 +12330,13 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
 ; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v59
 ; SI-NEXT:    v_or_b32_e32 v1, v12, v1
-; SI-NEXT:    v_or_b32_e32 v3, v11, v3
+; SI-NEXT:    v_or_b32_e32 v2, v11, v2
 ; SI-NEXT:    v_or_b32_e32 v4, v9, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v36
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v63
@@ -12516,7 +12501,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
@@ -12538,7 +12523,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v2
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
 ; SI-NEXT:    s_waitcnt vmcnt(11)
@@ -18572,8 +18557,8 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v44
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 16, v45
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v46
-; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v47
-; GFX9-NEXT:    v_lshrrev_b32_e32 v63, 16, v56
+; GFX9-NEXT:    v_lshrrev_b32_e32 v63, 16, v47
+; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v56
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 16, v57
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v36, 16, v58
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v62, 16, v59
@@ -18608,8 +18593,8 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_perm_b32 v2, v62, v59, s6
 ; GFX9-NEXT:    v_perm_b32 v3, v36, v58, s6
 ; GFX9-NEXT:    v_perm_b32 v4, v35, v57, s6
-; GFX9-NEXT:    v_perm_b32 v5, v63, v56, s6
-; GFX9-NEXT:    v_perm_b32 v6, v34, v47, s6
+; GFX9-NEXT:    v_perm_b32 v5, v34, v56, s6
+; GFX9-NEXT:    v_perm_b32 v6, v63, v47, s6
 ; GFX9-NEXT:    v_perm_b32 v7, v33, v46, s6
 ; GFX9-NEXT:    v_perm_b32 v8, v32, v45, s6
 ; GFX9-NEXT:    v_perm_b32 v9, v9, v44, s6
@@ -18651,8 +18636,8 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr39
 ; GFX9-NEXT:    ; implicit-def: $vgpr32
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; implicit-def: $vgpr34
 ; GFX9-NEXT:    ; implicit-def: $vgpr63
+; GFX9-NEXT:    ; implicit-def: $vgpr34
 ; GFX9-NEXT:    ; implicit-def: $vgpr35
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    ; implicit-def: $vgpr62
@@ -18766,8 +18751,8 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_perm_b32 v2, v62, v59, s6
 ; GFX9-NEXT:    v_perm_b32 v3, v36, v58, s6
 ; GFX9-NEXT:    v_perm_b32 v4, v35, v57, s6
-; GFX9-NEXT:    v_perm_b32 v5, v63, v56, s6
-; GFX9-NEXT:    v_perm_b32 v6, v34, v47, s6
+; GFX9-NEXT:    v_perm_b32 v5, v34, v56, s6
+; GFX9-NEXT:    v_perm_b32 v6, v63, v47, s6
 ; GFX9-NEXT:    v_perm_b32 v7, v33, v46, s6
 ; GFX9-NEXT:    v_perm_b32 v8, v32, v45, s6
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
@@ -20314,43 +20299,39 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
@@ -20373,12 +20354,10 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr47
 ; SI-NEXT:    ; implicit-def: $vgpr44
 ; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr42
 ; SI-NEXT:    ; implicit-def: $vgpr43
-; SI-NEXT:    ; implicit-def: $vgpr52
 ; SI-NEXT:    ; implicit-def: $vgpr40
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr34
+; SI-NEXT:    ; implicit-def: $vgpr54
+; SI-NEXT:    ; implicit-def: $vgpr52
 ; SI-NEXT:    ; implicit-def: $vgpr50
 ; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr38
@@ -20388,18 +20367,20 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr60
 ; SI-NEXT:    ; implicit-def: $vgpr58
 ; SI-NEXT:    ; implicit-def: $vgpr61
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; implicit-def: $vgpr63
-; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr53
-; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    ; implicit-def: $vgpr55
+; SI-NEXT:    ; implicit-def: $vgpr51
 ; SI-NEXT:    ; implicit-def: $vgpr37
-; SI-NEXT:    ; implicit-def: $vgpr54
+; SI-NEXT:    ; implicit-def: $vgpr53
 ; SI-NEXT:    ; implicit-def: $vgpr35
+; SI-NEXT:    ; implicit-def: $vgpr55
+; SI-NEXT:    ; implicit-def: $vgpr34
 ; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr42
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; SI-NEXT:    ; implicit-def: $vgpr31
@@ -20419,14 +20400,18 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; kill: killed $vgpr31
 ; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB32_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v42, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v54, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v41, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v28
 ; SI-NEXT:    v_cvt_f32_f16_e32 v55, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v27
@@ -20436,57 +20421,52 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v25
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v24
+; SI-NEXT:    v_cvt_f32_f16_e32 v39, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v23
 ; SI-NEXT:    s_waitcnt expcnt(2)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v61, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v22
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v12
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, v32
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v11
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v21
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v36, v32
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v38, v32
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v20
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v9
 ; SI-NEXT:    v_cvt_f32_f16_e32 v48, v32
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v8
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v50, v32
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v5
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v52, v32
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v18
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, v32
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v35
+; SI-NEXT:    v_cvt_f32_f16_e32 v54, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v5
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v17
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
+; SI-NEXT:    v_cvt_f32_f16_e32 v40, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v17
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v58, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v15
@@ -20494,6 +20474,9 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v14
 ; SI-NEXT:    v_cvt_f32_f16_e32 v62, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v13
+; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
 ; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
 ; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
@@ -20509,42 +20492,43 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, v35
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, v35
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v47, v35
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v43, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v45, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v47, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; SI-NEXT:    v_cvt_f32_f16_e32 v57, v35
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v59, v27
-; SI-NEXT:    v_cvt_f32_f16_e32 v63, v26
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f32_f16_e32 v52, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v57, v32
+; SI-NEXT:    v_cvt_f32_f16_e32 v32, v30
+; SI-NEXT:    v_cvt_f32_f16_e32 v34, v29
+; SI-NEXT:    v_cvt_f32_f16_e32 v35, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v37, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v59, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v63, v25
+; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; SI-NEXT:    v_cvt_f32_f16_e32 v44, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v46, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v56, v1
@@ -20582,61 +20566,37 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB32_4
 ; SI-NEXT:  ; %bb.3: ; %cmp.true
-; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_add_f32_e32 v23, 1.0, v23
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v1
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v23
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v46, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v37
 ; SI-NEXT:    v_add_f32_e32 v22, 1.0, v22
+; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v22
-; SI-NEXT:    v_add_f32_e32 v21, 1.0, v21
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v46, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v35
+; SI-NEXT:    v_add_f32_e32 v21, 1.0, v21
 ; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v21
 ; SI-NEXT:    v_add_f32_e32 v20, 1.0, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v20
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v34
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v20
 ; SI-NEXT:    v_add_f32_e32 v19, 1.0, v19
-; SI-NEXT:    v_add_f32_e32 v24, 1.0, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v19
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    v_add_f32_e32 v23, 1.0, v23
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v24
+; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v56, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v39
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v37
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v63
-; SI-NEXT:    v_add_f32_e32 v17, 1.0, v17
-; SI-NEXT:    v_add_f32_e32 v18, 1.0, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v18
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v61
-; SI-NEXT:    v_mov_b32_e32 v61, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v59
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT:    v_add_f32_e32 v4, 1.0, v4
-; SI-NEXT:    v_add_f32_e32 v6, 1.0, v6
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v43
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, v47
 ; SI-NEXT:    v_add_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_add_f32_e32 v4, 1.0, v4
 ; SI-NEXT:    v_add_f32_e32 v5, 1.0, v5
+; SI-NEXT:    v_add_f32_e32 v6, 1.0, v6
 ; SI-NEXT:    v_add_f32_e32 v7, 1.0, v7
 ; SI-NEXT:    v_add_f32_e32 v8, 1.0, v8
 ; SI-NEXT:    v_add_f32_e32 v9, 1.0, v9
@@ -20647,15 +20607,20 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    v_add_f32_e32 v14, 1.0, v14
 ; SI-NEXT:    v_add_f32_e32 v15, 1.0, v15
 ; SI-NEXT:    v_add_f32_e32 v16, 1.0, v16
+; SI-NEXT:    v_add_f32_e32 v17, 1.0, v17
+; SI-NEXT:    v_add_f32_e32 v18, 1.0, v18
+; SI-NEXT:    v_add_f32_e32 v24, 1.0, v24
 ; SI-NEXT:    v_add_f32_e32 v25, 1.0, v25
 ; SI-NEXT:    v_add_f32_e32 v26, 1.0, v26
 ; SI-NEXT:    v_add_f32_e32 v27, 1.0, v27
 ; SI-NEXT:    v_add_f32_e32 v28, 1.0, v28
 ; SI-NEXT:    v_add_f32_e32 v29, 1.0, v29
 ; SI-NEXT:    v_add_f32_e32 v30, 1.0, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v7
 ; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v8
 ; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v9
 ; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v10
@@ -20665,12 +20630,15 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v15
 ; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v24
 ; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v25
 ; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v26
 ; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v27
 ; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v29
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v30
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v29
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v30
 ; SI-NEXT:    v_cvt_f32_f16_e32 v30, v30
 ; SI-NEXT:    v_cvt_f32_f16_e32 v29, v29
 ; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
@@ -20696,62 +20664,70 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v52, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v44, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v42, v42
 ; SI-NEXT:    v_cvt_f32_f16_e32 v41, v41
-; SI-NEXT:    v_cvt_f32_f16_e32 v54, v54
 ; SI-NEXT:    v_cvt_f32_f16_e32 v55, v55
 ; SI-NEXT:    v_cvt_f32_f16_e32 v53, v53
 ; SI-NEXT:    v_cvt_f32_f16_e32 v51, v51
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v39, v39
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v61
+; SI-NEXT:    v_mov_b32_e32 v61, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v59
 ; SI-NEXT:    v_cvt_f32_f16_e32 v58, v58
 ; SI-NEXT:    v_cvt_f32_f16_e32 v60, v60
 ; SI-NEXT:    v_cvt_f32_f16_e32 v62, v62
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, v33
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v47, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v36, v36
 ; SI-NEXT:    v_cvt_f32_f16_e32 v38, v38
 ; SI-NEXT:    v_cvt_f32_f16_e32 v48, v48
 ; SI-NEXT:    v_cvt_f32_f16_e32 v50, v50
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, v40
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v45
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, v57
-; SI-NEXT:    v_mov_b32_e32 v39, v28
-; SI-NEXT:    v_mov_b32_e32 v37, v29
-; SI-NEXT:    v_mov_b32_e32 v35, v30
-; SI-NEXT:    v_mov_b32_e32 v63, v26
-; SI-NEXT:    v_mov_b32_e32 v59, v27
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_cvt_f32_f16_e32 v57, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v52, v52
+; SI-NEXT:    v_cvt_f32_f16_e32 v54, v54
+; SI-NEXT:    v_cvt_f32_f16_e32 v40, v40
+; SI-NEXT:    v_cvt_f32_f16_e32 v43, v43
+; SI-NEXT:    v_cvt_f32_f16_e32 v45, v45
+; SI-NEXT:    v_cvt_f32_f16_e32 v47, v47
+; SI-NEXT:    v_cvt_f32_f16_e32 v57, v57
+; SI-NEXT:    v_mov_b32_e32 v37, v27
+; SI-NEXT:    v_mov_b32_e32 v35, v28
+; SI-NEXT:    v_mov_b32_e32 v34, v29
+; SI-NEXT:    v_mov_b32_e32 v32, v30
+; SI-NEXT:    v_mov_b32_e32 v63, v25
+; SI-NEXT:    v_mov_b32_e32 v59, v26
+; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; SI-NEXT:  .LBB32_4: ; %end
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v57
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v57
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v56
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v46
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -20770,39 +20746,43 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v43
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v42
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v40
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v52
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v32
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v54
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v34
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v52
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 24, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v50
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
@@ -20811,7 +20791,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v48
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
@@ -20820,7 +20800,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v38
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 36, v0
@@ -20829,7 +20809,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v36
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 40, v0
@@ -20838,7 +20818,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v33
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 44, v0
@@ -20847,7 +20827,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v31
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
@@ -20856,7 +20836,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v62
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
@@ -20865,7 +20845,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v60
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 56, v0
@@ -20874,7 +20854,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v58
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
@@ -20884,8 +20864,8 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 64, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -20895,8 +20875,8 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x44, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -20906,8 +20886,8 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x48, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -20917,8 +20897,8 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x4c, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -20928,8 +20908,8 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x50, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -20939,8 +20919,8 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x54, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -20949,66 +20929,62 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v61
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x58, v0
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v61
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v39
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x5c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v49
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v63
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x60, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v51
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v63
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v59
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x64, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v53
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v59
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v37
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x68, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v55
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v39
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v35
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x6c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v54
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v37
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v41
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v34
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x70, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v41
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v35
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v42
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v32
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x74, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -24776,9 +24752,9 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, s19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, s20
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, s20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, s23
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, s22
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, s25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, s24
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, s27
@@ -24857,9 +24833,9 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(5)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v10, v2
+; SI-NEXT:    s_waitcnt expcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v3, v10, v3
 ; SI-NEXT:    s_waitcnt expcnt(3)
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v34
 ; SI-NEXT:    v_mov_b32_e32 v33, v32
@@ -24887,13 +24863,13 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
 ; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
 ; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v59
 ; SI-NEXT:    v_or_b32_e32 v1, v12, v1
-; SI-NEXT:    v_or_b32_e32 v3, v11, v3
+; SI-NEXT:    v_or_b32_e32 v2, v11, v2
 ; SI-NEXT:    v_or_b32_e32 v4, v9, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v36
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v63
@@ -25058,7 +25034,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
@@ -25080,7 +25056,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v2
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
 ; SI-NEXT:    s_waitcnt vmcnt(11)
@@ -28966,7 +28942,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v9
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v10
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v11
-; GFX11-TRUE16-NEXT:    s_mov_b32 s89, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s94, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s40, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB41_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -28995,12 +28971,12 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s19, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s17, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s91, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s92, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s93, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s94, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s89
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s16, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s91, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s92, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s93, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s94
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB41_3
 ; GFX11-TRUE16-NEXT:  .LBB41_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_u32 s6, s6, 3
@@ -29058,22 +29034,19 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s19, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s17, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s91, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s92, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s93, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s94, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s16, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s91, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s92, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s93, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB41_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s89, s94
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s93
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s89
-; GFX11-TRUE16-NEXT:    s_mov_b32 s89, s92
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s91
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s89
-; GFX11-TRUE16-NEXT:    s_mov_b32 s89, s90
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s88
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s93
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s92
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s91
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s90
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s89
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s88
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s18, s18, s79
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s19, s19, s78
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s20, s20, s77
@@ -29115,11 +29088,11 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v28, s6 :: v_dual_mov_b32 v29, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB41_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr94_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr93_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr92_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr91_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr90_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr89_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr88_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr79_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr78_lo16
@@ -30237,8 +30210,8 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v44
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 16, v45
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v46
-; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v47
-; GFX9-NEXT:    v_lshrrev_b32_e32 v63, 16, v56
+; GFX9-NEXT:    v_lshrrev_b32_e32 v63, 16, v47
+; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v56
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 16, v57
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v36, 16, v58
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v62, 16, v59
@@ -30273,8 +30246,8 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_perm_b32 v2, v62, v59, s6
 ; GFX9-NEXT:    v_perm_b32 v3, v36, v58, s6
 ; GFX9-NEXT:    v_perm_b32 v4, v35, v57, s6
-; GFX9-NEXT:    v_perm_b32 v5, v63, v56, s6
-; GFX9-NEXT:    v_perm_b32 v6, v34, v47, s6
+; GFX9-NEXT:    v_perm_b32 v5, v34, v56, s6
+; GFX9-NEXT:    v_perm_b32 v6, v63, v47, s6
 ; GFX9-NEXT:    v_perm_b32 v7, v33, v46, s6
 ; GFX9-NEXT:    v_perm_b32 v8, v32, v45, s6
 ; GFX9-NEXT:    v_perm_b32 v9, v9, v44, s6
@@ -30316,8 +30289,8 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr39
 ; GFX9-NEXT:    ; implicit-def: $vgpr32
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; implicit-def: $vgpr34
 ; GFX9-NEXT:    ; implicit-def: $vgpr63
+; GFX9-NEXT:    ; implicit-def: $vgpr34
 ; GFX9-NEXT:    ; implicit-def: $vgpr35
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    ; implicit-def: $vgpr62
@@ -30431,8 +30404,8 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_perm_b32 v2, v62, v59, s6
 ; GFX9-NEXT:    v_perm_b32 v3, v36, v58, s6
 ; GFX9-NEXT:    v_perm_b32 v4, v35, v57, s6
-; GFX9-NEXT:    v_perm_b32 v5, v63, v56, s6
-; GFX9-NEXT:    v_perm_b32 v6, v34, v47, s6
+; GFX9-NEXT:    v_perm_b32 v5, v34, v56, s6
+; GFX9-NEXT:    v_perm_b32 v6, v63, v47, s6
 ; GFX9-NEXT:    v_perm_b32 v7, v33, v46, s6
 ; GFX9-NEXT:    v_perm_b32 v8, v32, v45, s6
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
@@ -31979,43 +31952,39 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr35
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
@@ -32038,11 +32007,9 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr47
 ; SI-NEXT:    ; implicit-def: $vgpr44
 ; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr42
 ; SI-NEXT:    ; implicit-def: $vgpr43
 ; SI-NEXT:    ; implicit-def: $vgpr40
-; SI-NEXT:    ; implicit-def: $vgpr34
-; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr54
 ; SI-NEXT:    ; implicit-def: $vgpr52
 ; SI-NEXT:    ; implicit-def: $vgpr50
 ; SI-NEXT:    ; implicit-def: $vgpr48
@@ -32053,18 +32020,20 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr60
 ; SI-NEXT:    ; implicit-def: $vgpr58
 ; SI-NEXT:    ; implicit-def: $vgpr61
-; SI-NEXT:    ; kill: killed $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    ; kill: killed $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; implicit-def: $vgpr63
-; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr53
-; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    ; implicit-def: $vgpr55
+; SI-NEXT:    ; implicit-def: $vgpr51
 ; SI-NEXT:    ; implicit-def: $vgpr37
-; SI-NEXT:    ; implicit-def: $vgpr54
+; SI-NEXT:    ; implicit-def: $vgpr53
 ; SI-NEXT:    ; implicit-def: $vgpr35
+; SI-NEXT:    ; implicit-def: $vgpr55
+; SI-NEXT:    ; implicit-def: $vgpr34
 ; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr42
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; SI-NEXT:    ; implicit-def: $vgpr31
@@ -32084,14 +32053,18 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; kill: killed $vgpr31
 ; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB44_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v42, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v54, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v41, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v28
 ; SI-NEXT:    v_cvt_f32_f16_e32 v55, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v27
@@ -32101,57 +32074,52 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v25
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v24
+; SI-NEXT:    v_cvt_f32_f16_e32 v39, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v23
 ; SI-NEXT:    s_waitcnt expcnt(2)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v61, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v22
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v12
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, v32
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v11
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v21
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v36, v32
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v38, v32
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v20
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v9
 ; SI-NEXT:    v_cvt_f32_f16_e32 v48, v32
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v8
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v50, v32
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v52, v32
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v18
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
+; SI-NEXT:    v_cvt_f32_f16_e32 v54, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v5
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v17
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v40, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v17
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v58, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v15
@@ -32159,6 +32127,11 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v14
 ; SI-NEXT:    v_cvt_f32_f16_e32 v62, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v13
+; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
+; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
 ; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
 ; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
@@ -32172,44 +32145,43 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, v35
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, v35
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v47, v35
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v43, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v45, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v47, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, v34
-; SI-NEXT:    v_cvt_f32_f16_e32 v57, v35
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v59, v27
-; SI-NEXT:    v_cvt_f32_f16_e32 v63, v26
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v57, v32
+; SI-NEXT:    v_cvt_f32_f16_e32 v32, v30
+; SI-NEXT:    v_cvt_f32_f16_e32 v34, v29
+; SI-NEXT:    v_cvt_f32_f16_e32 v35, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v37, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v59, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v63, v25
+; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; SI-NEXT:    v_cvt_f32_f16_e32 v44, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v46, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v56, v1
@@ -32250,9 +32222,9 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v1
 ; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 3, v3
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v5
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 3, v7
 ; SI-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
@@ -32270,57 +32242,38 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_addc_u32_e32 v20, vcc, 0, v20, vcc
 ; SI-NEXT:    v_add_i32_e32 v21, vcc, 3, v21
 ; SI-NEXT:    v_addc_u32_e32 v22, vcc, 0, v22, vcc
-; SI-NEXT:    v_add_i32_e32 v23, vcc, 3, v23
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v23
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v46, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v37
 ; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v46, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v35
 ; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v21
 ; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v20
+; SI-NEXT:    v_add_i32_e32 v23, vcc, 3, v23
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v35
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v34
 ; SI-NEXT:    v_addc_u32_e32 v24, vcc, 0, v24, vcc
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v19
+; SI-NEXT:    v_add_i32_e32 v25, vcc, 3, v25
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v34
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v24
-; SI-NEXT:    v_cvt_f32_f16_e32 v56, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v39
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v18
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v63
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v43
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v61
-; SI-NEXT:    v_mov_b32_e32 v61, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v59
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, v47
-; SI-NEXT:    v_add_i32_e32 v25, vcc, 3, v25
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; SI-NEXT:    v_addc_u32_e32 v26, vcc, 0, v26, vcc
+; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v23
 ; SI-NEXT:    v_add_i32_e32 v27, vcc, 3, v27
+; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v1
 ; SI-NEXT:    v_addc_u32_e32 v28, vcc, 0, v28, vcc
+; SI-NEXT:    v_cvt_f32_f16_e32 v56, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v37
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v63
 ; SI-NEXT:    v_add_i32_e32 v29, vcc, 3, v29
 ; SI-NEXT:    v_addc_u32_e32 v30, vcc, 0, v30, vcc
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v6
 ; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v7
 ; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v8
 ; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v9
@@ -32331,12 +32284,15 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v15
 ; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v24
 ; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v25
 ; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v26
 ; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v27
 ; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v29
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v30
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v29
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v30
 ; SI-NEXT:    v_cvt_f32_f16_e32 v30, v30
 ; SI-NEXT:    v_cvt_f32_f16_e32 v29, v29
 ; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
@@ -32362,15 +32318,21 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v44, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v42, v42
 ; SI-NEXT:    v_cvt_f32_f16_e32 v41, v41
-; SI-NEXT:    v_cvt_f32_f16_e32 v54, v54
 ; SI-NEXT:    v_cvt_f32_f16_e32 v55, v55
 ; SI-NEXT:    v_cvt_f32_f16_e32 v53, v53
 ; SI-NEXT:    v_cvt_f32_f16_e32 v51, v51
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v39, v39
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v61
+; SI-NEXT:    v_mov_b32_e32 v61, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v59
 ; SI-NEXT:    v_cvt_f32_f16_e32 v58, v58
 ; SI-NEXT:    v_cvt_f32_f16_e32 v60, v60
 ; SI-NEXT:    v_cvt_f32_f16_e32 v62, v62
@@ -32381,43 +32343,45 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v48, v48
 ; SI-NEXT:    v_cvt_f32_f16_e32 v50, v50
 ; SI-NEXT:    v_cvt_f32_f16_e32 v52, v52
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, v45
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, v57
-; SI-NEXT:    v_mov_b32_e32 v39, v28
-; SI-NEXT:    v_mov_b32_e32 v37, v29
-; SI-NEXT:    v_mov_b32_e32 v35, v30
-; SI-NEXT:    v_mov_b32_e32 v63, v26
-; SI-NEXT:    v_mov_b32_e32 v59, v27
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v47, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_cvt_f32_f16_e32 v57, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v54, v54
+; SI-NEXT:    v_cvt_f32_f16_e32 v40, v40
+; SI-NEXT:    v_cvt_f32_f16_e32 v43, v43
+; SI-NEXT:    v_cvt_f32_f16_e32 v45, v45
+; SI-NEXT:    v_cvt_f32_f16_e32 v47, v47
+; SI-NEXT:    v_cvt_f32_f16_e32 v57, v57
+; SI-NEXT:    v_mov_b32_e32 v37, v27
+; SI-NEXT:    v_mov_b32_e32 v35, v28
+; SI-NEXT:    v_mov_b32_e32 v34, v29
+; SI-NEXT:    v_mov_b32_e32 v32, v30
+; SI-NEXT:    v_mov_b32_e32 v63, v25
+; SI-NEXT:    v_mov_b32_e32 v59, v26
+; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; SI-NEXT:  .LBB44_4: ; %end
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v57
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v57
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v56
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v46
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -32436,30 +32400,34 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v43
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v42
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v34
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v40
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v40
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v32
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v54
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v52
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 24, v0
@@ -32468,7 +32436,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v50
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
@@ -32477,7 +32445,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v48
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
@@ -32486,7 +32454,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v38
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 36, v0
@@ -32495,7 +32463,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v36
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 40, v0
@@ -32504,7 +32472,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v33
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 44, v0
@@ -32513,7 +32481,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v31
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
@@ -32522,7 +32490,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v62
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
@@ -32531,7 +32499,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v60
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 56, v0
@@ -32540,7 +32508,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v58
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
@@ -32550,8 +32518,8 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 64, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -32561,8 +32529,8 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x44, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -32572,8 +32540,8 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x48, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -32583,8 +32551,8 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x4c, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -32594,8 +32562,8 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x50, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -32605,8 +32573,8 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x54, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -32615,66 +32583,62 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v61
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x58, v0
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v61
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v39
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x5c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v49
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v63
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x60, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v51
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v63
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v59
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x64, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v53
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v59
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v37
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x68, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v55
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v39
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v35
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x6c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v54
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v37
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v41
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v34
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x70, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v41
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v35
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v42
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v32
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x74, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -34434,7 +34398,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v9
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v10
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v11
-; GFX11-TRUE16-NEXT:    s_mov_b32 s89, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s94, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s40, vcc_lo, exec_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB45_4
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
@@ -34463,12 +34427,12 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s19, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s17, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s91, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s92, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s93, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s94, s0, 16
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s89
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s16, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s91, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s92, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s93, s0, 16
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s94
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB45_3
 ; GFX11-TRUE16-NEXT:  .LBB45_2: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_add_u32 s6, s6, 3
@@ -34526,22 +34490,19 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s78, s19, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s79, s18, 16
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s88, s17, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s16, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s91, s3, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s92, s2, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s93, s1, 16
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s94, s0, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s89, s16, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s90, s3, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s91, s2, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s92, s1, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s93, s0, 16
 ; GFX11-TRUE16-NEXT:  .LBB45_3: ; %end
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s89, s94
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s93
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s89
-; GFX11-TRUE16-NEXT:    s_mov_b32 s89, s92
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s91
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s89
-; GFX11-TRUE16-NEXT:    s_mov_b32 s89, s90
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s88
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s93
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s92
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s91
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s90
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s89
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s88
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s18, s18, s79
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s19, s19, s78
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s20, s20, s77
@@ -34583,11 +34544,11 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v28, s6 :: v_dual_mov_b32 v29, s4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB45_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr94_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr93_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr92_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr91_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr90_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr89_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr88_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr79_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $sgpr78_lo16
@@ -36479,9 +36440,9 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, s19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, s20
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, s20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, s23
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, s22
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, s25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, s24
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, s27
@@ -36560,9 +36521,9 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(5)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v10, v2
+; SI-NEXT:    s_waitcnt expcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v3, v10, v3
 ; SI-NEXT:    s_waitcnt expcnt(3)
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v34
 ; SI-NEXT:    v_mov_b32_e32 v33, v32
@@ -36590,13 +36551,13 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
 ; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v59
 ; SI-NEXT:    v_or_b32_e32 v1, v12, v1
-; SI-NEXT:    v_or_b32_e32 v3, v11, v3
+; SI-NEXT:    v_or_b32_e32 v2, v11, v2
 ; SI-NEXT:    v_or_b32_e32 v4, v9, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v36
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v63
@@ -36761,7 +36722,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
@@ -36783,7 +36744,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v2
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
 ; SI-NEXT:    s_waitcnt vmcnt(11)
@@ -40905,8 +40866,8 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v44
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 16, v45
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v46
-; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v47
-; GFX9-NEXT:    v_lshrrev_b32_e32 v63, 16, v56
+; GFX9-NEXT:    v_lshrrev_b32_e32 v63, 16, v47
+; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v56
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 16, v57
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v36, 16, v58
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v62, 16, v59
@@ -40941,8 +40902,8 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_perm_b32 v2, v62, v59, s6
 ; GFX9-NEXT:    v_perm_b32 v3, v36, v58, s6
 ; GFX9-NEXT:    v_perm_b32 v4, v35, v57, s6
-; GFX9-NEXT:    v_perm_b32 v5, v63, v56, s6
-; GFX9-NEXT:    v_perm_b32 v6, v34, v47, s6
+; GFX9-NEXT:    v_perm_b32 v5, v34, v56, s6
+; GFX9-NEXT:    v_perm_b32 v6, v63, v47, s6
 ; GFX9-NEXT:    v_perm_b32 v7, v33, v46, s6
 ; GFX9-NEXT:    v_perm_b32 v8, v32, v45, s6
 ; GFX9-NEXT:    v_perm_b32 v9, v9, v44, s6
@@ -40984,8 +40945,8 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr39
 ; GFX9-NEXT:    ; implicit-def: $vgpr32
 ; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; implicit-def: $vgpr34
 ; GFX9-NEXT:    ; implicit-def: $vgpr63
+; GFX9-NEXT:    ; implicit-def: $vgpr34
 ; GFX9-NEXT:    ; implicit-def: $vgpr35
 ; GFX9-NEXT:    ; implicit-def: $vgpr36
 ; GFX9-NEXT:    ; implicit-def: $vgpr62
@@ -41099,8 +41060,8 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_perm_b32 v2, v62, v59, s6
 ; GFX9-NEXT:    v_perm_b32 v3, v36, v58, s6
 ; GFX9-NEXT:    v_perm_b32 v4, v35, v57, s6
-; GFX9-NEXT:    v_perm_b32 v5, v63, v56, s6
-; GFX9-NEXT:    v_perm_b32 v6, v34, v47, s6
+; GFX9-NEXT:    v_perm_b32 v5, v34, v56, s6
+; GFX9-NEXT:    v_perm_b32 v6, v63, v47, s6
 ; GFX9-NEXT:    v_perm_b32 v7, v33, v46, s6
 ; GFX9-NEXT:    v_perm_b32 v8, v32, v45, s6
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
@@ -43954,27 +43915,29 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; SI-NEXT:    s_cbranch_scc0 .LBB53_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_lshr_b32 s44, s5, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v36, s44
+; SI-NEXT:    s_waitcnt expcnt(4)
+; SI-NEXT:    v_cvt_f32_f16_e32 v59, s44
 ; SI-NEXT:    s_lshr_b32 s44, s4, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, s44
+; SI-NEXT:    v_cvt_f32_f16_e32 v36, s44
 ; SI-NEXT:    s_lshr_b32 s44, s7, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v29, s44
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, s44
 ; SI-NEXT:    s_lshr_b32 s44, s6, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, s44
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_cvt_f32_f16_e32 v61, s44
 ; SI-NEXT:    s_lshr_b32 s44, s9, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v18, s44
+; SI-NEXT:    v_cvt_f32_f16_e32 v33, s44
 ; SI-NEXT:    s_lshr_b32 s44, s8, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, s44
+; SI-NEXT:    v_cvt_f32_f16_e32 v48, s44
 ; SI-NEXT:    s_lshr_b32 s44, s11, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, s44
+; SI-NEXT:    v_cvt_f32_f16_e32 v25, s44
 ; SI-NEXT:    s_lshr_b32 s44, s10, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s44
 ; SI-NEXT:    s_lshr_b32 s44, s13, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, s44
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, s44
 ; SI-NEXT:    s_lshr_b32 s44, s12, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v17, s44
 ; SI-NEXT:    s_lshr_b32 s44, s15, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, s44
+; SI-NEXT:    v_cvt_f32_f16_e32 v43, s44
 ; SI-NEXT:    s_lshr_b32 s44, s14, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v22, s44
 ; SI-NEXT:    s_lshr_b32 s44, s41, 16
@@ -44010,24 +43973,21 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; SI-NEXT:    s_lshr_b32 s44, s18, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v56, s44
 ; SI-NEXT:    s_lshr_b32 s44, s17, 16
-; SI-NEXT:    s_waitcnt expcnt(5)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v58, s44
 ; SI-NEXT:    s_lshr_b32 s44, s16, 16
-; SI-NEXT:    s_waitcnt expcnt(3)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v60, s44
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, s5
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, s5
+; SI-NEXT:    v_cvt_f32_f16_e32 v18, s4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v19, s7
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_cvt_f32_f16_e32 v61, s6
+; SI-NEXT:    v_cvt_f32_f16_e32 v29, s6
+; SI-NEXT:    v_cvt_f32_f16_e32 v52, s9
+; SI-NEXT:    v_cvt_f32_f16_e32 v40, s8
+; SI-NEXT:    v_cvt_f32_f16_e32 v45, s11
+; SI-NEXT:    v_cvt_f32_f16_e32 v47, s10
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v62, s9
-; SI-NEXT:    v_cvt_f32_f16_e32 v52, s8
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, s11
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, s10
+; SI-NEXT:    v_cvt_f32_f16_e32 v62, s13
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v63, s13
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, s12
+; SI-NEXT:    v_cvt_f32_f16_e32 v63, s12
 ; SI-NEXT:    v_cvt_f32_f16_e32 v23, s15
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, s14
 ; SI-NEXT:    v_cvt_f32_f16_e32 v27, s41
@@ -44045,9 +44005,9 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; SI-NEXT:    v_cvt_f32_f16_e32 v42, s21
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, s20
 ; SI-NEXT:    v_cvt_f32_f16_e32 v20, s19
-; SI-NEXT:    v_cvt_f32_f16_e32 v47, s18
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, s18
 ; SI-NEXT:    v_cvt_f32_f16_e32 v57, s17
-; SI-NEXT:    v_cvt_f32_f16_e32 v59, s16
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s16
 ; SI-NEXT:    s_cbranch_execnz .LBB53_3
 ; SI-NEXT:  .LBB53_2: ; %cmp.true
 ; SI-NEXT:    v_add_f64 v[1:2], s[16:17], 1.0
@@ -44068,107 +44028,104 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v42
+; SI-NEXT:    v_add_f64 v[15:16], s[10:11], 1.0
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v53
 ; SI-NEXT:    v_add_f64 v[20:21], s[12:13], 1.0
-; SI-NEXT:    v_add_f64 v[11:12], s[8:9], 1.0
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v54
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v21
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v15
 ; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v20
-; SI-NEXT:    v_cvt_f32_f16_e32 v63, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v21
+; SI-NEXT:    v_cvt_f32_f16_e32 v62, v21
+; SI-NEXT:    v_cvt_f32_f16_e32 v63, v20
 ; SI-NEXT:    v_cvt_f32_f16_e32 v20, v58
-; SI-NEXT:    v_cvt_f32_f16_e32 v47, v57
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v57
 ; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v57, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v48
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v6
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v6
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f64 v[37:38], s[26:27], 1.0
+; SI-NEXT:    v_cvt_f32_f16_e32 v47, v15
+; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v38
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v37
+; SI-NEXT:    v_cvt_f32_f16_e32 v15, v37
+; SI-NEXT:    v_cvt_f32_f16_e32 v37, v55
 ; SI-NEXT:    v_add_f64 v[7:8], s[6:7], 1.0
 ; SI-NEXT:    v_add_f64 v[34:35], s[28:29], 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v61, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v53
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_f64 v[3:4], s[4:5], 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v34
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v34
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, v55
-; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v19, v8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v41
-; SI-NEXT:    v_add_f64 v[37:38], s[26:27], 1.0
-; SI-NEXT:    v_add_f64 v[15:16], s[10:11], 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v38
-; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v37
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v37
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v44
 ; SI-NEXT:    v_add_f64 v[30:31], s[42:43], 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v62, v12
+; SI-NEXT:    v_add_f64 v[11:12], s[8:9], 1.0
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v35
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v12
+; SI-NEXT:    v_cvt_f32_f16_e32 v52, v12
 ; SI-NEXT:    v_cvt_f32_f16_e32 v12, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v30, v39
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v46
+; SI-NEXT:    v_cvt_f32_f16_e32 v30, v32
+; SI-NEXT:    v_cvt_f32_f16_e32 v32, v39
+; SI-NEXT:    v_cvt_f32_f16_e32 v39, v44
 ; SI-NEXT:    v_add_f64 v[49:50], s[24:25], 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v50
-; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v49
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v50
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v45, v16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v16, v49
-; SI-NEXT:    v_cvt_f32_f16_e32 v49, v56
-; SI-NEXT:    v_cvt_f32_f16_e32 v59, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v49, v46
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_add_f64 v[3:4], s[4:5], 1.0
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v34
+; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v18, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v34
+; SI-NEXT:    v_cvt_f32_f16_e32 v34, v51
+; SI-NEXT:    v_cvt_f32_f16_e32 v51, v56
 ; SI-NEXT:    v_add_f64 v[26:27], s[40:41], 1.0
 ; SI-NEXT:    v_add_f64 v[22:23], s[14:15], 1.0
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v35
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v40, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v22
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v48
+; SI-NEXT:    v_cvt_f32_f16_e32 v26, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v54
+; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v31
 ; SI-NEXT:    v_lshrrev_b32_e32 v24, 16, v27
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v22
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v23
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v52, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v29, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v22
 ; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v26
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; SI-NEXT:    v_cvt_f32_f16_e32 v35, v35
 ; SI-NEXT:    v_cvt_f32_f16_e32 v38, v38
 ; SI-NEXT:    v_cvt_f32_f16_e32 v50, v50
 ; SI-NEXT:    v_cvt_f32_f16_e32 v54, v54
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v53
 ; SI-NEXT:    v_cvt_f32_f16_e32 v42, v42
 ; SI-NEXT:    v_cvt_f32_f16_e32 v36, v36
+; SI-NEXT:    v_cvt_f32_f16_e32 v61, v61
 ; SI-NEXT:    v_cvt_f32_f16_e32 v33, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v29, v29
+; SI-NEXT:    v_cvt_f32_f16_e32 v48, v43
 ; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v18
-; SI-NEXT:    v_mov_b32_e32 v18, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v45
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v59
+; SI-NEXT:    v_mov_b32_e32 v59, v22
 ; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v10
+; SI-NEXT:    v_cvt_f32_f16_e32 v43, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v13
+; SI-NEXT:    v_mov_b32_e32 v13, v26
 ; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v28, v32
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v51
-; SI-NEXT:    v_cvt_f32_f16_e32 v51, v60
+; SI-NEXT:    v_cvt_f32_f16_e32 v26, v10
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v53, v60
 ; SI-NEXT:    v_cvt_f32_f16_e32 v58, v58
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v53, v6
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v55, v6
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -44187,9 +44144,9 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; SI-NEXT:    v_cvt_f32_f16_e32 v60, v6
 ; SI-NEXT:  .LBB53_3: ; %end
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v60
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v59
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v57
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; SI-NEXT:    v_or_b32_e32 v6, v9, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, v58
@@ -44201,7 +44158,7 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; SI-NEXT:    buffer_store_dword v9, v6, s[0:3], 0 offen
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v56
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v47
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v21
 ; SI-NEXT:    v_add_i32_e32 v10, vcc, 8, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; SI-NEXT:    v_or_b32_e32 v6, v9, v6
@@ -44271,125 +44228,125 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; SI-NEXT:    buffer_store_dword v6, v10, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v34
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v3
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 48, v0
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 48, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_or_b32_e32 v6, v9, v6
-; SI-NEXT:    buffer_store_dword v6, v10, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v3, v3, v6
+; SI-NEXT:    buffer_store_dword v3, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v32
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v35
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 52, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_or_b32_e32 v6, v9, v6
-; SI-NEXT:    buffer_store_dword v6, v10, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v32
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v35
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 52, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v3, v6, v3
+; SI-NEXT:    buffer_store_dword v3, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v30
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v12
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 56, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_or_b32_e32 v6, v9, v6
-; SI-NEXT:    buffer_store_dword v6, v10, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v30
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v12
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 56, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v3, v6, v3
+; SI-NEXT:    buffer_store_dword v3, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v28
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v31
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 60, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_or_b32_e32 v6, v9, v6
-; SI-NEXT:    buffer_store_dword v6, v10, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v31
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 60, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v3, v6, v3
+; SI-NEXT:    buffer_store_dword v3, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v11
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 64, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_or_b32_e32 v6, v9, v6
-; SI-NEXT:    buffer_store_dword v6, v10, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v26
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v11
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 64, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v3, v6, v3
+; SI-NEXT:    buffer_store_dword v3, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v27
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 0x44, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_or_b32_e32 v6, v9, v6
-; SI-NEXT:    buffer_store_dword v6, v10, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v24
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v27
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x44, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v3, v6, v3
+; SI-NEXT:    buffer_store_dword v3, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v22
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x48, v0
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x54, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_or_b32_e32 v4, v4, v6
-; SI-NEXT:    buffer_store_dword v4, v9, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v22
+; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x48, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v3, v4, v3
+; SI-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v45
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v23
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x4c, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_or_b32_e32 v4, v6, v4
-; SI-NEXT:    buffer_store_dword v4, v9, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v43
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v23
+; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x4c, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v3, v4, v3
+; SI-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v21
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x50, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_or_b32_e32 v4, v6, v4
-; SI-NEXT:    buffer_store_dword v4, v9, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v17
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v63
+; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x50, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v3, v4, v3
+; SI-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v63
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x60, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_or_b32_e32 v4, v6, v4
-; SI-NEXT:    buffer_store_dword v4, v8, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v62
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x54, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v3, v4, v3
+; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v43
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v47
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x58, v0
-; SI-NEXT:    v_or_b32_e32 v1, v4, v1
-; SI-NEXT:    buffer_store_dword v1, v6, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x58, v0
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v40
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x5c, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v25
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v45
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x5c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v1, v4, v1
-; SI-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v52
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v48
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v40
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x60, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v62
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v33
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v52
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x64, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v61
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v61
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v29
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x68, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v29
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v19
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x6c, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v33
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v36
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v18
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x70, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v36
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v13
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v59
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v14
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x74, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -44413,11 +44370,11 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB53_4:
-; SI-NEXT:    ; implicit-def: $vgpr59
+; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr60
 ; SI-NEXT:    ; implicit-def: $vgpr57
 ; SI-NEXT:    ; implicit-def: $vgpr58
-; SI-NEXT:    ; implicit-def: $vgpr47
+; SI-NEXT:    ; implicit-def: $vgpr21
 ; SI-NEXT:    ; implicit-def: $vgpr56
 ; SI-NEXT:    ; implicit-def: $vgpr20
 ; SI-NEXT:    ; implicit-def: $vgpr46
@@ -44452,27 +44409,27 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
 ; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr22
 ; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr63
-; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr43
+; SI-NEXT:    ; implicit-def: $vgpr63
+; SI-NEXT:    ; implicit-def: $vgpr17
+; SI-NEXT:    ; implicit-def: $vgpr62
+; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr47
 ; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; implicit-def: $vgpr25
 ; SI-NEXT:    ; implicit-def: $vgpr40
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr52
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr62
-; SI-NEXT:    ; implicit-def: $vgpr18
+; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr29
 ; SI-NEXT:    ; implicit-def: $vgpr61
-; SI-NEXT:    ; implicit-def: $vgpr25
 ; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $vgpr18
 ; SI-NEXT:    ; implicit-def: $vgpr36
+; SI-NEXT:    ; implicit-def: $vgpr14
+; SI-NEXT:    ; implicit-def: $vgpr59
 ; SI-NEXT:    s_branch .LBB53_2
 ;
 ; VI-LABEL: bitcast_v15f64_to_v60f16_scalar:
@@ -47008,9 +46965,9 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, s19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, s20
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, s20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, s23
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, s22
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, s25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, s24
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, s27
@@ -47089,9 +47046,9 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(5)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v10, v2
+; SI-NEXT:    s_waitcnt expcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v3, v10, v3
 ; SI-NEXT:    s_waitcnt expcnt(3)
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v34
 ; SI-NEXT:    v_mov_b32_e32 v33, v32
@@ -47119,13 +47076,13 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
 ; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
 ; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v59
 ; SI-NEXT:    v_or_b32_e32 v1, v12, v1
-; SI-NEXT:    v_or_b32_e32 v3, v11, v3
+; SI-NEXT:    v_or_b32_e32 v2, v11, v2
 ; SI-NEXT:    v_or_b32_e32 v4, v9, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v36
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v63
@@ -47290,7 +47247,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
 ; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
@@ -47312,7 +47269,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v2
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
 ; SI-NEXT:    s_waitcnt vmcnt(11)
@@ -50068,7 +50025,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v49
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB57_2
@@ -50090,123 +50047,118 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, s18
-; SI-NEXT:    v_mov_b32_e32 v48, v39
+; SI-NEXT:    v_cvt_f32_f16_e32 v57, v39
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v4
 ; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, s19
-; SI-NEXT:    v_cvt_f32_f16_e32 v57, v39
+; SI-NEXT:    v_cvt_f32_f16_e32 v44, v38
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v5
 ; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, s20
-; SI-NEXT:    v_mov_b32_e32 v39, v38
+; SI-NEXT:    v_cvt_f32_f16_e32 v47, v37
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v6
 ; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, s21
-; SI-NEXT:    v_cvt_f32_f16_e32 v44, v38
+; SI-NEXT:    v_cvt_f32_f16_e32 v42, v36
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v7
 ; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, s22
-; SI-NEXT:    v_mov_b32_e32 v38, v37
+; SI-NEXT:    v_cvt_f32_f16_e32 v45, v35
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v8
 ; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, s23
-; SI-NEXT:    v_cvt_f32_f16_e32 v47, v37
+; SI-NEXT:    v_mov_b32_e32 v35, v34
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v9
 ; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, s24
-; SI-NEXT:    v_mov_b32_e32 v37, v36
+; SI-NEXT:    v_cvt_f32_f16_e32 v40, v34
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v10
 ; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, s25
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v36
+; SI-NEXT:    v_mov_b32_e32 v34, v33
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v11
 ; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, s26
-; SI-NEXT:    v_mov_b32_e32 v36, v35
+; SI-NEXT:    v_cvt_f32_f16_e32 v43, v33
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v12
 ; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, s27
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, v35
+; SI-NEXT:    v_mov_b32_e32 v33, v32
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v13
 ; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, s28
-; SI-NEXT:    v_mov_b32_e32 v35, v34
+; SI-NEXT:    v_cvt_f32_f16_e32 v54, v32
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v14
 ; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, s29
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v34
+; SI-NEXT:    v_mov_b32_e32 v32, v50
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v15
 ; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, v28
-; SI-NEXT:    v_mov_b32_e32 v34, v33
+; SI-NEXT:    v_cvt_f32_f16_e32 v41, v50
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v16
 ; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, v33
+; SI-NEXT:    v_cvt_f32_f16_e32 v52, v63
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v17
 ; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, v59
-; SI-NEXT:    v_mov_b32_e32 v33, v32
+; SI-NEXT:    v_cvt_f32_f16_e32 v55, v62
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v18
-; SI-NEXT:    v_cvt_f32_f16_e32 v54, v32
-; SI-NEXT:    v_mov_b32_e32 v32, v50
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v50
+; SI-NEXT:    v_cvt_f32_f16_e32 v50, v61
+; SI-NEXT:    v_cvt_f32_f16_e32 v53, v60
+; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v19
-; SI-NEXT:    v_cvt_f32_f16_e32 v52, v63
-; SI-NEXT:    v_cvt_f32_f16_e32 v55, v62
-; SI-NEXT:    v_cvt_f32_f16_e32 v50, v61
+; SI-NEXT:    v_cvt_f32_f16_e32 v51, v58
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v20
-; SI-NEXT:    v_cvt_f32_f16_e32 v53, v60
-; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f32_f16_e32 v51, v58
 ; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v21
@@ -50235,31 +50187,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; kill: killed $vgpr49
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v48, v39
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; kill: killed $vgpr49
-; SI-NEXT:    v_mov_b32_e32 v39, v38
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; kill: killed $vgpr49
-; SI-NEXT:    v_mov_b32_e32 v38, v37
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; kill: killed $vgpr49
-; SI-NEXT:    v_mov_b32_e32 v37, v36
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; kill: killed $vgpr49
-; SI-NEXT:    v_mov_b32_e32 v36, v35
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; kill: killed $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; kill: killed $vgpr49
 ; SI-NEXT:    v_mov_b32_e32 v35, v34
 ; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; kill: killed $vgpr31
@@ -50327,6 +50254,26 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; SI-NEXT:    ; kill: killed $vgpr49
 ; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    ; kill: killed $vgpr49
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    ; kill: killed $vgpr49
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    ; kill: killed $vgpr49
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    ; kill: killed $vgpr49
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    ; kill: killed $vgpr49
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; kill: killed $vgpr31
 ; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; kill: killed $vgpr31
 ; SI-NEXT:    ; implicit-def: $vgpr31
@@ -50362,11 +50309,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; SI-NEXT:    v_add_i32_e32 v32, vcc, 3, v33
 ; SI-NEXT:    v_add_i32_e32 v33, vcc, 3, v34
 ; SI-NEXT:    v_add_i32_e32 v34, vcc, 3, v35
-; SI-NEXT:    v_add_i32_e32 v35, vcc, 3, v36
-; SI-NEXT:    v_add_i32_e32 v36, vcc, 3, v37
-; SI-NEXT:    v_add_i32_e32 v37, vcc, 3, v38
-; SI-NEXT:    v_add_i32_e32 v38, vcc, 3, v39
-; SI-NEXT:    v_add_i32_e32 v39, vcc, 3, v48
 ; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, s16
 ; SI-NEXT:    s_add_i32 s17, s17, 3
@@ -50409,45 +50351,50 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, s23
 ; SI-NEXT:    v_add_i32_e32 v63, vcc, 3, v63
-; SI-NEXT:    v_cvt_f32_f16_e32 v57, v39
+; SI-NEXT:    v_add_i32_e32 v36, vcc, 3, v36
 ; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, s24
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v38
-; SI-NEXT:    v_cvt_f32_f16_e32 v47, v37
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v36
+; SI-NEXT:    v_add_i32_e32 v37, vcc, 3, v37
+; SI-NEXT:    v_add_i32_e32 v38, vcc, 3, v38
 ; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, s25
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, v35
-; SI-NEXT:    v_cvt_f32_f16_e32 v54, v34
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, v33
+; SI-NEXT:    v_add_i32_e32 v39, vcc, 3, v39
+; SI-NEXT:    v_add_i32_e32 v48, vcc, 3, v48
 ; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, s26
-; SI-NEXT:    v_cvt_f32_f16_e32 v52, v32
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v50, v63
+; SI-NEXT:    v_cvt_f32_f16_e32 v44, v48
+; SI-NEXT:    v_cvt_f32_f16_e32 v57, v39
+; SI-NEXT:    v_cvt_f32_f16_e32 v42, v38
 ; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, s27
-; SI-NEXT:    v_cvt_f32_f16_e32 v55, v62
-; SI-NEXT:    v_cvt_f32_f16_e32 v53, v60
-; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_add_i32_e32 v48, vcc, 3, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v47, v37
+; SI-NEXT:    v_cvt_f32_f16_e32 v40, v36
+; SI-NEXT:    v_cvt_f32_f16_e32 v54, v34
 ; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, s28
-; SI-NEXT:    v_cvt_f32_f16_e32 v44, v48
-; SI-NEXT:    v_cvt_f32_f16_e32 v51, v58
+; SI-NEXT:    v_cvt_f32_f16_e32 v43, v33
+; SI-NEXT:    v_cvt_f32_f16_e32 v52, v32
+; SI-NEXT:    v_cvt_f32_f16_e32 v41, v31
 ; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(13)
+; SI-NEXT:    v_add_i32_e32 v35, vcc, 3, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, s29
+; SI-NEXT:    v_cvt_f32_f16_e32 v45, v35
+; SI-NEXT:    v_cvt_f32_f16_e32 v50, v63
+; SI-NEXT:    v_cvt_f32_f16_e32 v55, v62
 ; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v49, v61
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_cvt_f32_f16_e32 v53, v60
+; SI-NEXT:    v_cvt_f32_f16_e32 v51, v58
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_add_i32_e32 v30, vcc, 3, v1
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -51443,62 +51390,53 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s46
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB57_4
 ; GFX11-TRUE16-NEXT:  .LBB57_2: ; %cmp.true
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v11.h
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v9.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v11.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v10.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v12, 16, v11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v8.h
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v14, 16, v9
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v6.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v5.h
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v12, 16, v11
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v8.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v13, 16, v10
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v14, 16, v6
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v7.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v5.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v8, v12, 16, v8
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v14, 16, v6
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v15, 16, v5
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v4.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v1.h
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v0.h
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s15, s24, s15
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v14, 16, v1
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s15, s22
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v13, 16, v7
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v15, 16, v5
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v12, 16, v4
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v3.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v2.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v14, 16, v1
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v15, 16, v0
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s15, s24, s15
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v12, 16, v3
-; GFX11-TRUE16-NEXT:    v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT:    s_mov_b32 s15, s22
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v13, 16, v2
-; GFX11-TRUE16-NEXT:    v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT:    v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s14, s23, s14
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s11, s15, s11
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v18, s14, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s14, s21
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v19, s11, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s20
-; GFX11-TRUE16-NEXT:    v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT:    v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT:    v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT:    v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT:    v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v12, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v13, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v15, 16, v0
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s10, s14, s10
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s9, s11, s9
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v0
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v1
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v20, s10, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s19
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v21, s9, 3 op_sel_hi:[1,0]
@@ -51510,13 +51448,13 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v64.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v65.l
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v4
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v5
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v22, s8, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s17
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v23, s7, 3 op_sel_hi:[1,0]
@@ -51543,16 +51481,18 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v32, s2, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v31, s3, 3 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v30, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v64.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v65.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v66.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v67.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v68.l
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v7
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v8
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v9
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v10
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 16, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v11
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v37
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v36
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v32
@@ -51568,14 +51508,21 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v14
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v13
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v17
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v16
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v15
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v64.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v65.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v66.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.h, v67.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.h, v68.l
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v15
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v64.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v65.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v66.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v67.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v68.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v69.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v70.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v71.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v80.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v81.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.h, v82.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.h, v83.l
 ; GFX11-TRUE16-NEXT:    s_branch .LBB57_5
 ; GFX11-TRUE16-NEXT:  .LBB57_3:
 ; GFX11-TRUE16-NEXT:    s_branch .LBB57_2
@@ -51598,9 +51545,9 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, s2
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, s1
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, s45
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, s45
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, s44
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, s43
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, s43
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, s42
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, s41
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, s15
@@ -51649,8 +51596,8 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v27, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v17, v25, 16, v18
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v25, 16, v17
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v17, v27, 16, v18
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v18, v19, 16, v0
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.h
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v19, v20, 16, v1
@@ -51992,48 +51939,48 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
 ; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:56
 ; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:60
 ; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:76
-; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:80
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:84
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:88
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:68
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:80
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:84
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:88
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v55, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v40, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, v2
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v18
 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f16_f32_e32 v59, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v52, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v40, v4
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v13
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v18
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v60, v2
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v53, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v49, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v13
+; SI-NEXT:    v_cvt_f16_f32_e32 v37, v15
+; SI-NEXT:    v_cvt_f16_f32_e32 v52, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v9
 ; SI-NEXT:    v_cvt_f16_f32_e32 v48, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v15
 ; SI-NEXT:    v_cvt_f16_f32_e32 v38, v16
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v23
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v24
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v24
 ; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v24, v26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v26, v27
 ; SI-NEXT:    v_cvt_f16_f32_e32 v27, v28
 ; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
 ; SI-NEXT:    v_cvt_f16_f32_e32 v28, v30
+; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v51
@@ -52043,7 +51990,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v44, v62
 ; SI-NEXT:    v_cvt_f16_f32_e32 v18, v63
 ; SI-NEXT:    v_cvt_f16_f32_e32 v19, v33
-; SI-NEXT:    v_cvt_f16_f32_e32 v62, v36
+; SI-NEXT:    v_cvt_f16_f32_e32 v33, v36
 ; SI-NEXT:    v_cvt_f16_f32_e32 v43, v39
 ; SI-NEXT:    v_cvt_f16_f32_e32 v15, v50
 ; SI-NEXT:    v_cvt_f16_f32_e32 v16, v54
@@ -52057,26 +52004,28 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v30, v56
 ; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_cvt_f16_f32_e32 v57, v57
+; SI-NEXT:    v_cvt_f16_f32_e32 v63, v57
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_cvt_f16_f32_e32 v63, v31
+; SI-NEXT:    v_cvt_f16_f32_e32 v62, v58
 ; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v32
+; SI-NEXT:    v_cvt_f16_f32_e32 v36, v31
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, v58
+; SI-NEXT:    v_cvt_f16_f32_e32 v45, v32
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_cvt_f16_f32_e32 v31, v34
+; SI-NEXT:    v_cvt_f16_f32_e32 v31, v59
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, v35
+; SI-NEXT:    v_cvt_f16_f32_e32 v32, v34
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v37
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v35
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:92
 ; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:96
 ; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:100
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:104
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:108
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:112
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:116
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:104
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:108
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:112
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:116
 ; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v42, v3
 ; SI-NEXT:    s_waitcnt vmcnt(5)
@@ -52084,19 +52033,21 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v35
+; SI-NEXT:    v_cvt_f16_f32_e32 v39, v34
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cvt_f16_f32_e32 v41, v37
+; SI-NEXT:    v_cvt_f16_f32_e32 v41, v35
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v45
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v46
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v46
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v47
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], s[4:5]
 ; SI-NEXT:    v_mov_b32_e32 v47, v21
-; SI-NEXT:    v_mov_b32_e32 v56, v6
+; SI-NEXT:    v_mov_b32_e32 v56, v17
+; SI-NEXT:    v_mov_b32_e32 v57, v6
 ; SI-NEXT:    v_mov_b32_e32 v58, v7
+; SI-NEXT:    v_mov_b32_e32 v59, v33
 ; SI-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB58_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.true
@@ -52104,34 +52055,34 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, v34
+; SI-NEXT:    v_cvt_f32_f16_e32 v32, v32
 ; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v63
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v62
 ; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v57
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v63
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
 ; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT:    v_add_f32_e32 v34, 0x38000000, v34
+; SI-NEXT:    v_add_f32_e32 v32, 0x38000000, v32
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, v34
+; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
 ; SI-NEXT:    v_add_f32_e32 v31, 0x38000000, v31
 ; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT:    v_cvt_f16_f32_e32 v63, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v62, v7
 ; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT:    v_or_b32_e32 v3, v3, v35
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v10
-; SI-NEXT:    v_or_b32_e32 v9, v9, v35
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v34
-; SI-NEXT:    v_or_b32_e32 v31, v31, v35
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v63
-; SI-NEXT:    v_or_b32_e32 v57, v6, v35
+; SI-NEXT:    v_or_b32_e32 v3, v3, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v10
+; SI-NEXT:    v_or_b32_e32 v9, v9, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v32
+; SI-NEXT:    v_or_b32_e32 v31, v31, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v62
+; SI-NEXT:    v_or_b32_e32 v63, v6, v34
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
 ; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
 ; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
@@ -52147,7 +52098,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
 ; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
 ; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
@@ -52158,119 +52109,114 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
 ; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
 ; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
 ; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
 ; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v13
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
 ; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
-; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
-; SI-NEXT:    v_or_b32_e32 v12, v12, v35
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v16
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_or_b32_e32 v12, v12, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
 ; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
-; SI-NEXT:    v_or_b32_e32 v15, v15, v35
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v19
+; SI-NEXT:    v_or_b32_e32 v15, v15, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
-; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_or_b32_e32 v18, v18, v35
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v23
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_or_b32_e32 v22, v22, v35
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v27
-; SI-NEXT:    v_or_b32_e32 v26, v26, v35
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v8
-; SI-NEXT:    v_or_b32_e32 v11, v11, v35
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v20
-; SI-NEXT:    v_or_b32_e32 v1, v1, v35
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v38
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v48
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, v59
-; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v35
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, v35
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
+; SI-NEXT:    v_or_b32_e32 v18, v18, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_add_f32_e32 v45, 0x38000000, v45
+; SI-NEXT:    v_or_b32_e32 v22, v22, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v27
+; SI-NEXT:    v_or_b32_e32 v26, v26, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v5
+; SI-NEXT:    v_or_b32_e32 v11, v11, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v20
+; SI-NEXT:    v_or_b32_e32 v2, v2, v34
+; SI-NEXT:    v_cvt_f32_f16_e32 v34, v38
+; SI-NEXT:    v_cvt_f32_f16_e32 v35, v37
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    v_add_f32_e32 v34, 0x38000000, v34
+; SI-NEXT:    v_cvt_f16_f32_e32 v38, v34
+; SI-NEXT:    v_add_f32_e32 v34, 0x38000000, v35
+; SI-NEXT:    v_cvt_f16_f32_e32 v34, v34
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v38
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_or_b32_e32 v2, v2, v35
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v49
-; SI-NEXT:    v_cvt_f16_f32_e32 v45, v45
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v35
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, v35
-; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v37
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v45
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v49
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v48, v35, v37
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v53
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v52
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_or_b32_e32 v37, v34, v35
+; SI-NEXT:    v_cvt_f32_f16_e32 v34, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v35, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v46, v58
+; SI-NEXT:    v_add_f32_e32 v34, 0x38000000, v34
+; SI-NEXT:    v_cvt_f16_f32_e32 v49, v34
+; SI-NEXT:    v_add_f32_e32 v34, 0x38000000, v35
+; SI-NEXT:    v_cvt_f16_f32_e32 v34, v34
+; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v49
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT:    v_or_b32_e32 v48, v34, v35
+; SI-NEXT:    v_cvt_f32_f16_e32 v34, v53
+; SI-NEXT:    v_cvt_f32_f16_e32 v35, v52
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v35
-; SI-NEXT:    v_cvt_f16_f32_e32 v53, v35
-; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v37
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT:    v_add_f32_e32 v34, 0x38000000, v34
+; SI-NEXT:    v_cvt_f16_f32_e32 v53, v34
+; SI-NEXT:    v_add_f32_e32 v34, 0x38000000, v35
+; SI-NEXT:    v_cvt_f16_f32_e32 v34, v34
 ; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v53
+; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v53
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_or_b32_e32 v52, v35, v37
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v40
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v55
-; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
+; SI-NEXT:    v_or_b32_e32 v52, v34, v35
+; SI-NEXT:    v_cvt_f32_f16_e32 v34, v40
+; SI-NEXT:    v_cvt_f32_f16_e32 v35, v55
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v35
-; SI-NEXT:    v_cvt_f16_f32_e32 v40, v35
-; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v37
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v40
+; SI-NEXT:    v_cvt_f32_f16_e32 v42, v42
+; SI-NEXT:    v_add_f32_e32 v34, 0x38000000, v34
+; SI-NEXT:    v_cvt_f16_f32_e32 v40, v34
+; SI-NEXT:    v_add_f32_e32 v34, 0x38000000, v35
+; SI-NEXT:    v_cvt_f16_f32_e32 v34, v34
 ; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; SI-NEXT:    v_or_b32_e32 v55, v35, v37
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v60
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v40
 ; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
-; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v35
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT:    v_or_b32_e32 v55, v34, v35
+; SI-NEXT:    v_cvt_f32_f16_e32 v34, v60
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v35, v6
 ; SI-NEXT:    v_cvt_f32_f16_e32 v43, v43
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v35
 ; SI-NEXT:    v_cvt_f32_f16_e32 v51, v51
-; SI-NEXT:    v_or_b32_e32 v6, v37, v35
+; SI-NEXT:    v_add_f32_e32 v34, 0x38000000, v34
+; SI-NEXT:    v_cvt_f16_f32_e32 v34, v34
+; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v35
+; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
+; SI-NEXT:    v_cvt_f32_f16_e32 v30, v30
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
+; SI-NEXT:    v_cvt_f32_f16_e32 v41, v41
+; SI-NEXT:    v_or_b32_e32 v6, v35, v34
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f32_f16_e32 v30, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v42
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v41
 ; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
 ; SI-NEXT:    v_cvt_f32_f16_e32 v29, v29
 ; SI-NEXT:    v_cvt_f32_f16_e32 v54, v54
 ; SI-NEXT:    v_cvt_f32_f16_e32 v50, v50
 ; SI-NEXT:    v_cvt_f32_f16_e32 v36, v36
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v33
 ; SI-NEXT:    v_cvt_f32_f16_e32 v39, v39
+; SI-NEXT:    v_add_f32_e32 v42, 0x38000000, v42
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, v42
 ; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
 ; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
 ; SI-NEXT:    v_add_f32_e32 v43, 0x38000000, v43
 ; SI-NEXT:    v_add_f32_e32 v51, 0x38000000, v51
 ; SI-NEXT:    v_add_f32_e32 v30, 0x38000000, v30
-; SI-NEXT:    v_add_f32_e32 v32, 0x38000000, v32
-; SI-NEXT:    v_add_f32_e32 v42, 0x38000000, v42
 ; SI-NEXT:    v_add_f32_e32 v41, 0x38000000, v41
 ; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
 ; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v25
@@ -52281,10 +52227,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
 ; SI-NEXT:    v_add_f32_e32 v54, 0x38000000, v54
 ; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
 ; SI-NEXT:    v_add_f32_e32 v50, 0x38000000, v50
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
 ; SI-NEXT:    v_add_f32_e32 v36, 0x38000000, v36
-; SI-NEXT:    v_cvt_f16_f32_e32 v42, v42
-; SI-NEXT:    v_add_f32_e32 v33, 0x38000000, v33
 ; SI-NEXT:    v_cvt_f16_f32_e32 v41, v41
 ; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v39
 ; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
@@ -52292,145 +52235,158 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v54, v54
 ; SI-NEXT:    v_cvt_f16_f32_e32 v50, v50
 ; SI-NEXT:    v_cvt_f16_f32_e32 v36, v36
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v33
 ; SI-NEXT:    v_cvt_f16_f32_e32 v39, v39
+; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v42
 ; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
 ; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
 ; SI-NEXT:    v_lshlrev_b32_e32 v43, 16, v43
 ; SI-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
 ; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
-; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v42
 ; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v41
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v24
 ; SI-NEXT:    v_or_b32_e32 v29, v29, v28
 ; SI-NEXT:    v_or_b32_e32 v54, v54, v51
 ; SI-NEXT:    v_or_b32_e32 v50, v50, v30
-; SI-NEXT:    v_or_b32_e32 v36, v36, v32
-; SI-NEXT:    v_or_b32_e32 v33, v33, v42
 ; SI-NEXT:    v_or_b32_e32 v39, v39, v41
-; SI-NEXT:    v_alignbit_b32 v60, v55, v35, 16
-; SI-NEXT:    v_alignbit_b32 v59, v52, v45, 16
+; SI-NEXT:    v_alignbit_b32 v60, v55, v34, 16
 ; SI-NEXT:    v_alignbit_b32 v24, v26, v24, 16
 ; SI-NEXT:    v_alignbit_b32 v28, v22, v28, 16
 ; SI-NEXT:    v_alignbit_b32 v51, v12, v51, 16
-; SI-NEXT:    v_alignbit_b32 v30, v57, v30, 16
-; SI-NEXT:    v_alignbit_b32 v32, v31, v32, 16
-; SI-NEXT:    v_alignbit_b32 v42, v9, v42, 16
+; SI-NEXT:    v_alignbit_b32 v30, v63, v30, 16
 ; SI-NEXT:    v_alignbit_b32 v41, v3, v41, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v6
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v46, v6
-; SI-NEXT:    v_or_b32_e32 v6, v37, v45
+; SI-NEXT:    v_cvt_f32_f16_e32 v35, v6
+; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v35
+; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
+; SI-NEXT:    v_or_b32_e32 v6, v35, v1
 ; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v46
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT:    v_cvt_f32_f16_e32 v46, v58
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_or_b32_e32 v6, v37, v5
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v46
+; SI-NEXT:    v_cvt_f32_f16_e32 v46, v57
+; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
+; SI-NEXT:    v_alignbit_b32 v1, v52, v1, 16
 ; SI-NEXT:    v_add_f32_e32 v46, 0x38000000, v46
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v56
+; SI-NEXT:    v_or_b32_e32 v58, v35, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v35, v56
 ; SI-NEXT:    v_cvt_f16_f32_e32 v46, v46
-; SI-NEXT:    v_alignbit_b32 v5, v48, v5, 16
-; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT:    v_or_b32_e32 v58, v46, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT:    v_alignbit_b32 v8, v48, v8, 16
+; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v35
+; SI-NEXT:    v_or_b32_e32 v57, v46, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
 ; SI-NEXT:    v_cvt_f32_f16_e32 v46, v47
-; SI-NEXT:    v_alignbit_b32 v14, v2, v14, 16
-; SI-NEXT:    v_or_b32_e32 v56, v37, v17
-; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v46
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT:    v_cvt_f32_f16_e32 v46, v62
-; SI-NEXT:    v_alignbit_b32 v17, v1, v17, 16
+; SI-NEXT:    v_alignbit_b32 v14, v37, v14, 16
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v6
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
+; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT:    v_or_b32_e32 v56, v35, v17
+; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v46
+; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
+; SI-NEXT:    v_cvt_f32_f16_e32 v46, v59
 ; SI-NEXT:    v_add_f32_e32 v46, 0x38000000, v46
 ; SI-NEXT:    v_cvt_f16_f32_e32 v46, v46
-; SI-NEXT:    v_or_b32_e32 v62, v46, v43
+; SI-NEXT:    v_or_b32_e32 v59, v46, v43
 ; SI-NEXT:    v_alignbit_b32 v43, v15, v43, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v21, v6
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
 ; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; SI-NEXT:    v_or_b32_e32 v47, v37, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v44
+; SI-NEXT:    v_or_b32_e32 v47, v35, v21
+; SI-NEXT:    v_cvt_f32_f16_e32 v35, v44
 ; SI-NEXT:    v_cvt_f32_f16_e32 v44, v61
-; SI-NEXT:    v_alignbit_b32 v6, v11, v21, 16
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v35
+; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
+; SI-NEXT:    v_add_f32_e32 v44, 0x38000000, v44
+; SI-NEXT:    v_cvt_f16_f32_e32 v44, v44
+; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v35
+; SI-NEXT:    v_or_b32_e32 v61, v44, v35
+; SI-NEXT:    v_cvt_f32_f16_e32 v44, v45
 ; SI-NEXT:    v_add_f32_e32 v44, 0x38000000, v44
 ; SI-NEXT:    v_cvt_f16_f32_e32 v44, v44
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v37
-; SI-NEXT:    v_or_b32_e32 v61, v44, v37
-; SI-NEXT:    v_alignbit_b32 v44, v18, v37, 16
+; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v44
+; SI-NEXT:    v_or_b32_e32 v36, v36, v45
+; SI-NEXT:    v_alignbit_b32 v44, v18, v35, 16
+; SI-NEXT:    v_alignbit_b32 v45, v31, v45, 16
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v33, v6
+; SI-NEXT:    v_add_f32_e32 v33, 0x38000000, v33
+; SI-NEXT:    v_cvt_f16_f32_e32 v33, v33
+; SI-NEXT:    v_or_b32_e32 v6, v33, v42
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v6, v2, v17, 16
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v6, v11, v21, 16
+; SI-NEXT:    v_alignbit_b32 v42, v9, v42, 16
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT:  .LBB58_2: ; %end
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v60
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v60
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v35, 0xffff, v6
-; SI-NEXT:    v_or_b32_e32 v35, v35, v37
-; SI-NEXT:    buffer_store_dword v35, v0, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v34, 0xffff, v6
+; SI-NEXT:    v_or_b32_e32 v34, v34, v35
+; SI-NEXT:    buffer_store_dword v34, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v35, 0xffff, v55
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v40
-; SI-NEXT:    v_or_b32_e32 v35, v35, v37
-; SI-NEXT:    v_add_i32_e32 v37, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v35, v37, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v34, 0xffff, v55
+; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v40
+; SI-NEXT:    v_or_b32_e32 v34, v34, v35
+; SI-NEXT:    v_add_i32_e32 v35, vcc, 4, v0
+; SI-NEXT:    buffer_store_dword v34, v35, s[0:3], 0 offen
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v59
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v35, 0xffff, v6
-; SI-NEXT:    v_or_b32_e32 v35, v35, v37
-; SI-NEXT:    v_add_i32_e32 v37, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v35, v37, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v35, 0xffff, v52
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v53
-; SI-NEXT:    v_or_b32_e32 v35, v35, v37
-; SI-NEXT:    v_add_i32_e32 v37, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v35, v37, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v35, 0xffff, v6
-; SI-NEXT:    v_or_b32_e32 v5, v35, v5
-; SI-NEXT:    v_add_i32_e32 v35, vcc, 16, v0
-; SI-NEXT:    buffer_store_dword v5, v35, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v34, 0xffff, v6
+; SI-NEXT:    v_or_b32_e32 v1, v34, v1
+; SI-NEXT:    v_add_i32_e32 v34, vcc, 8, v0
+; SI-NEXT:    buffer_store_dword v1, v34, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v52
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v53
+; SI-NEXT:    v_or_b32_e32 v1, v1, v34
+; SI-NEXT:    v_add_i32_e32 v34, vcc, 12, v0
+; SI-NEXT:    buffer_store_dword v1, v34, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v58
+; SI-NEXT:    v_or_b32_e32 v1, v1, v8
+; SI-NEXT:    v_add_i32_e32 v8, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v1, v8, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v49
+; SI-NEXT:    v_or_b32_e32 v1, v1, v8
+; SI-NEXT:    v_add_i32_e32 v8, vcc, 20, v0
+; SI-NEXT:    buffer_store_dword v1, v8, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v48
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v49
-; SI-NEXT:    v_or_b32_e32 v5, v5, v35
-; SI-NEXT:    v_add_i32_e32 v35, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v5, v35, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v58
-; SI-NEXT:    v_or_b32_e32 v5, v5, v14
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v5, v14, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v38
-; SI-NEXT:    v_or_b32_e32 v2, v2, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 28, v0
-; SI-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v57
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
+; SI-NEXT:    v_or_b32_e32 v1, v1, v8
+; SI-NEXT:    v_add_i32_e32 v8, vcc, 24, v0
+; SI-NEXT:    buffer_store_dword v1, v8, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v56
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v17
-; SI-NEXT:    v_or_b32_e32 v2, v2, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v37
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v38
+; SI-NEXT:    v_or_b32_e32 v1, v1, v8
+; SI-NEXT:    v_add_i32_e32 v8, vcc, 28, v0
+; SI-NEXT:    buffer_store_dword v1, v8, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v56
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
+; SI-NEXT:    v_or_b32_e32 v1, v1, v8
+; SI-NEXT:    v_add_i32_e32 v8, vcc, 32, v0
+; SI-NEXT:    buffer_store_dword v1, v8, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v20
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 36, v0
@@ -52445,7 +52401,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -52486,7 +52442,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v62
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v59
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v43
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x48, v0
@@ -52516,26 +52472,28 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v57
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v63
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v63
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v62
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v36
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v32
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v45
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v31
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v32
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v33
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v42
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -53230,9 +53188,9 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v48, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v49, v6
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, v7
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v42, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v37, v9
 ; SI-NEXT:    v_cvt_f16_f32_e32 v38, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v11
@@ -53317,7 +53275,8 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v33, v11
 ; SI-NEXT:    v_mov_b32_e32 v11, v8
 ; SI-NEXT:    v_mov_b32_e32 v8, v5
-; SI-NEXT:    v_mov_b32_e32 v5, v1
+; SI-NEXT:    v_mov_b32_e32 v5, v42
+; SI-NEXT:    v_mov_b32_e32 v42, v1
 ; SI-NEXT:    s_cbranch_vccnz .LBB59_5
 ; SI-NEXT:  ; %bb.4: ; %cmp.true
 ; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -54326,62 +54285,53 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s46
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB59_4
 ; GFX11-TRUE16-NEXT:  .LBB59_2: ; %cmp.true
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v11.h
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v9.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v11.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v10.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v12, 16, v11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v8.h
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v14, 16, v9
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v6.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v5.h
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v12, 16, v11
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v8.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v13, 16, v10
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v14, 16, v6
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v7.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v5.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v8, v12, 16, v8
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v14, 16, v6
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v15, 16, v5
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v4.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v1.h
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v0.h
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s15, s24, s15
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v14, 16, v1
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s15, s22
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v13, 16, v7
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v15, 16, v5
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v12, 16, v4
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v3.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v2.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v14, 16, v1
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v15, 16, v0
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s15, s24, s15
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v12, 16, v3
-; GFX11-TRUE16-NEXT:    v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT:    s_mov_b32 s15, s22
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v13, 16, v2
-; GFX11-TRUE16-NEXT:    v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT:    v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s14, s23, s14
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s11, s15, s11
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v18, 0x200, s14 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s14, s21
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v19, 0x200, s11 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s20
-; GFX11-TRUE16-NEXT:    v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT:    v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT:    v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT:    v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT:    v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v12, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v13, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v15, 16, v0
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s10, s14, s10
 ; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s9, s11, s9
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v0
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v1
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v20, 0x200, s10 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s19
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v21, 0x200, s9 op_sel_hi:[0,1]
@@ -54393,13 +54343,13 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v64.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v65.l
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v4
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v5
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v22, 0x200, s8 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s17
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v23, 0x200, s7 op_sel_hi:[0,1]
@@ -54426,16 +54376,18 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v32, 0x200, s2 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v31, 0x200, s3 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v30, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v64.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v65.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v66.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v67.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v68.l
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v7
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v8
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v9
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v10
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 16, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v11
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v37
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v36
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v32
@@ -54451,14 +54403,21 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v14
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v13
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v17
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v16
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v15
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v64.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v65.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v66.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.h, v67.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.h, v68.l
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v15
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v64.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v65.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v66.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v67.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v68.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v69.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v70.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v71.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v80.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v81.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.h, v82.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.h, v83.l
 ; GFX11-TRUE16-NEXT:    s_branch .LBB59_5
 ; GFX11-TRUE16-NEXT:  .LBB59_3:
 ; GFX11-TRUE16-NEXT:    s_branch .LBB59_2
@@ -54481,9 +54440,9 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, s2
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, s1
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.l, s0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, s45
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, s45
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, s44
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, s43
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, s43
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, s42
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, s41
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, s15
@@ -54532,8 +54491,8 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v27, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v17, v25, 16, v18
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v25, 16, v17
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v17, v27, 16, v18
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v18, v19, 16, v0
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.h
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v19, v20, 16, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
index 8a2602e668a15..f888f4f3b1407 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
@@ -7895,11 +7895,11 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v13
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v12
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[16:17], 24, v[10:11]
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v13.h
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[12:13]
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v10.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v13.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v13.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v13.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v10.l
 ; GFX11-TRUE16-NEXT:  .LBB38_2: ; %Flow
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB38_4
@@ -7969,7 +7969,6 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v10
 ; GFX11-TRUE16-NEXT:  .LBB38_4: ; %end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v12.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v14.l
@@ -11042,57 +11041,62 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) {
 ; VI-LABEL: bitcast_v6i16_to_v12i8:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v8, v2
-; VI-NEXT:    v_mov_b32_e32 v14, v1
-; VI-NEXT:    v_mov_b32_e32 v13, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v14
-; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v13
-; VI-NEXT:    ; implicit-def: $vgpr1
+; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
+; VI-NEXT:    ; implicit-def: $vgpr16
+; VI-NEXT:    ; implicit-def: $vgpr15
 ; VI-NEXT:    ; implicit-def: $vgpr3
+; VI-NEXT:    ; implicit-def: $vgpr14
 ; VI-NEXT:    ; implicit-def: $vgpr5
 ; VI-NEXT:    ; implicit-def: $vgpr7
+; VI-NEXT:    ; implicit-def: $vgpr8
 ; VI-NEXT:    ; implicit-def: $vgpr9
 ; VI-NEXT:    ; implicit-def: $vgpr11
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT:    s_cbranch_execz .LBB46_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
-; VI-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; VI-NEXT:    v_lshrrev_b32_e32 v7, 24, v14
-; VI-NEXT:    v_lshrrev_b32_e32 v5, 8, v14
-; VI-NEXT:    v_lshrrev_b64 v[11:12], 24, v[8:9]
-; VI-NEXT:    v_lshrrev_b64 v[3:4], 24, v[13:14]
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v13
-; VI-NEXT:  ; %bb.2: ; %Flow
+; VI-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; VI-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v15, 8, v0
+; VI-NEXT:    v_lshrrev_b64 v[11:12], 24, v[2:3]
+; VI-NEXT:    v_lshrrev_b64 v[3:4], 24, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v16, v0
+; VI-NEXT:    v_mov_b32_e32 v14, v1
+; VI-NEXT:    v_mov_b32_e32 v8, v2
+; VI-NEXT:    ; implicit-def: $vgpr1
+; VI-NEXT:    ; implicit-def: $vgpr2
+; VI-NEXT:  .LBB46_2: ; %Flow
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; VI-NEXT:    s_cbranch_execz .LBB46_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
 ; VI-NEXT:    v_mov_b32_e32 v3, 3
-; VI-NEXT:    v_add_u16_sdwa v6, v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v15, 3, v14
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
-; VI-NEXT:    v_add_u16_sdwa v2, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_sdwa v10, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v1, v15, v0
-; VI-NEXT:    v_add_u16_e32 v14, 3, v13
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; VI-NEXT:    v_add_u16_e32 v16, 3, v8
-; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v10
-; VI-NEXT:    v_or_b32_e32 v0, v14, v0
-; VI-NEXT:    v_or_b32_e32 v7, v16, v3
+; VI-NEXT:    v_add_u16_sdwa v6, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v13, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_e32 v14, 3, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
+; VI-NEXT:    v_add_u16_e32 v16, 3, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v13
+; VI-NEXT:    v_or_b32_e32 v1, v14, v1
+; VI-NEXT:    v_or_b32_e32 v0, v16, v0
+; VI-NEXT:    v_add_u16_sdwa v10, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_e32 v8, 3, v2
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v10
 ; VI-NEXT:    v_lshrrev_b64 v[3:4], 24, v[0:1]
-; VI-NEXT:    v_lshrrev_b64 v[11:12], 24, v[7:8]
-; VI-NEXT:    v_lshrrev_b32_e32 v9, 8, v7
+; VI-NEXT:    v_or_b32_e32 v2, v8, v2
+; VI-NEXT:    v_lshrrev_b64 v[11:12], 24, v[2:3]
+; VI-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v15, 8, v0
 ; VI-NEXT:    v_bfe_u32 v7, v6, 8, 8
-; VI-NEXT:    v_mov_b32_e32 v13, v14
-; VI-NEXT:    v_mov_b32_e32 v14, v15
-; VI-NEXT:    v_mov_b32_e32 v8, v16
 ; VI-NEXT:  .LBB46_4: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    v_mov_b32_e32 v0, v13
+; VI-NEXT:    v_mov_b32_e32 v0, v16
+; VI-NEXT:    v_mov_b32_e32 v1, v15
+; VI-NEXT:    v_mov_b32_e32 v2, v13
 ; VI-NEXT:    v_mov_b32_e32 v4, v14
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index c4c977f86c299..70211c302829c 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -7632,8 +7632,8 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX7LESS-NEXT:    s_load_dword s10, s[4:5], 0xd
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v4, s7, v0
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
 ; GFX7LESS-NEXT:    ; implicit-def: $vgpr0
 ; GFX7LESS-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB13_4
@@ -7653,28 +7653,27 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX7LESS-NEXT:    s_not_b32 s13, s12
 ; GFX7LESS-NEXT:    s_lshl_b32 s14, s6, s11
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s15
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s15
 ; GFX7LESS-NEXT:    s_mov_b32 s6, -1
 ; GFX7LESS-NEXT:  .LBB13_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s14, v4
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s14, v1
 ; GFX7LESS-NEXT:    v_and_b32_e32 v0, s12, v0
-; GFX7LESS-NEXT:    v_and_b32_e32 v1, s13, v4
-; GFX7LESS-NEXT:    v_or_b32_e32 v3, v1, v0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_and_b32_e32 v2, s13, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB13_2
 ; GFX7LESS-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_lshrrev_b32_e32 v0, s11, v0
+; GFX7LESS-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX7LESS-NEXT:  .LBB13_4: ; %Flow
 ; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
@@ -7684,7 +7683,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX7LESS-NEXT:    s_and_b32 s5, s10, 0xff
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7LESS-NEXT:    v_mad_u32_u24 v0, s5, v2, v0
+; GFX7LESS-NEXT:    v_mad_u32_u24 v0, s5, v4, v0
 ; GFX7LESS-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
@@ -7694,8 +7693,8 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX8-NEXT:    s_load_dword s10, s[4:5], 0x34
 ; GFX8-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8-NEXT:    s_cbranch_execz .LBB13_4
@@ -7714,27 +7713,27 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX8-NEXT:    s_lshl_b32 s14, s2, s11
 ; GFX8-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s7
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:  .LBB13_2: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT:    v_mov_b32_e32 v4, v0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s14, v4
-; GFX8-NEXT:    v_and_b32_e32 v1, s13, v4
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s14, v1
+; GFX8-NEXT:    v_and_b32_e32 v2, s13, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, s12, v0
-; GFX8-NEXT:    v_or_b32_e32 v3, v1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v3
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX8-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB13_2
 ; GFX8-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s11, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX8-NEXT:  .LBB13_4: ; %Flow
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
@@ -7743,7 +7742,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    v_mad_u16 v0, s10, v2, v0
+; GFX8-NEXT:    v_mad_u16 v0, s10, v4, v0
 ; GFX8-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -7753,8 +7752,8 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX9-NEXT:    s_load_dword s10, s[4:5], 0x34
 ; GFX9-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB13_4
@@ -7773,26 +7772,26 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX9-NEXT:    s_lshl_b32 s14, s2, s11
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:  .LBB13_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-NEXT:    v_add_u32_e32 v0, s14, v4
+; GFX9-NEXT:    v_add_u32_e32 v0, s14, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, s12, v0
-; GFX9-NEXT:    v_and_or_b32 v3, v4, s13, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v3
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s13, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB13_2
 ; GFX9-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s11, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX9-NEXT:  .LBB13_4: ; %Flow
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
@@ -7801,7 +7800,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_mad_legacy_u16 v0, s10, v2, v0
+; GFX9-NEXT:    v_mad_legacy_u16 v0, s10, v4, v0
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -7812,9 +7811,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1064-NEXT:    s_load_dword s10, s[4:5], 0x34
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB13_4
 ; GFX1064-NEXT:  ; %bb.1:
@@ -7833,32 +7832,32 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s6, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-NEXT:  .LBB13_2: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s14, v4
+; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s14, v1
 ; GFX1064-NEXT:    v_and_b32_e32 v0, s12, v0
-; GFX1064-NEXT:    v_and_or_b32 v3, v4, s13, v0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v4
-; GFX1064-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1064-NEXT:    v_and_or_b32 v0, v1, s13, v0
+; GFX1064-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1064-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB13_2
 ; GFX1064-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    v_lshrrev_b32_e32 v0, s11, v0
+; GFX1064-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX1064-NEXT:  .LBB13_4: ; %Flow
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1064-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT:    v_mad_u16 v0, s10, v2, s2
+; GFX1064-NEXT:    v_mad_u16 v0, s10, v4, s2
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
@@ -7870,9 +7869,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1032-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX1032-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s10, 0
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s6, 0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v4, s6, 0
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
 ; GFX1032-NEXT:    s_and_saveexec_b32 s9, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB13_4
 ; GFX1032-NEXT:  ; %bb.1:
@@ -7890,32 +7889,32 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1032-NEXT:    s_lshl_b32 s12, s6, s2
 ; GFX1032-NEXT:    s_mov_b32 s6, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032-NEXT:  .LBB13_2: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s12, v4
+; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s12, v1
 ; GFX1032-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX1032-NEXT:    v_and_or_b32 v3, v4, s11, v0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v4
-; GFX1032-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1032-NEXT:    v_and_or_b32 v0, v1, s11, v0
+; GFX1032-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1032-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1032-NEXT:    s_or_b32 s10, vcc_lo, s10
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s10
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB13_2
 ; GFX1032-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s10
-; GFX1032-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1032-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1032-NEXT:  .LBB13_4: ; %Flow
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
 ; GFX1032-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT:    v_mad_u16 v0, s8, v2, s2
+; GFX1032-NEXT:    v_mad_u16 v0, s8, v4, s2
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
@@ -7929,9 +7928,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1164-TRUE16-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX1164-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
 ; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1164-TRUE16-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
 ; GFX1164-TRUE16-NEXT:    ; implicit-def: $vgpr0_lo16
-; GFX1164-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1164-TRUE16-NEXT:    s_cbranch_execz .LBB13_4
 ; GFX1164-TRUE16-NEXT:  ; %bb.1:
 ; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
@@ -7950,29 +7949,29 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1164-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1164-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1164-TRUE16-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-TRUE16-NEXT:  .LBB13_2: ; %atomicrmw.start
 ; GFX1164-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1164-TRUE16-NEXT:    v_add_nc_u32_e32 v0, s14, v4
-; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_add_nc_u32_e32 v0, s14, v1
 ; GFX1164-TRUE16-NEXT:    v_and_b32_e32 v0, s12, v0
-; GFX1164-TRUE16-NEXT:    v_and_or_b32 v3, v4, s13, v0
-; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, v4
-; GFX1164-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1164-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1164-TRUE16-NEXT:    v_and_or_b32 v0, v1, s13, v0
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1164-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX1164-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1164-TRUE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-TRUE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-TRUE16-NEXT:    s_cbranch_execnz .LBB13_2
 ; GFX1164-TRUE16-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX1164-TRUE16-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1164-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v0
+; GFX1164-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX1164-TRUE16-NEXT:  .LBB13_4: ; %Flow
 ; GFX1164-TRUE16-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -7981,7 +7980,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1164-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT:    v_mad_u16 v0.l, s10, v2.l, s2
+; GFX1164-TRUE16-NEXT:    v_mad_u16 v0.l, s10, v4.l, s2
 ; GFX1164-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
 ; GFX1164-TRUE16-NEXT:    s_endpgm
@@ -7995,9 +7994,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1164-FAKE16-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX1164-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
 ; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1164-FAKE16-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
 ; GFX1164-FAKE16-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1164-FAKE16-NEXT:    s_cbranch_execz .LBB13_4
 ; GFX1164-FAKE16-NEXT:  ; %bb.1:
 ; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8015,29 +8014,29 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1164-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1164-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1164-FAKE16-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-FAKE16-NEXT:  .LBB13_2: ; %atomicrmw.start
 ; GFX1164-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1164-FAKE16-NEXT:    v_add_nc_u32_e32 v0, s14, v4
-; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_add_nc_u32_e32 v0, s14, v1
 ; GFX1164-FAKE16-NEXT:    v_and_b32_e32 v0, s12, v0
-; GFX1164-FAKE16-NEXT:    v_and_or_b32 v3, v4, s13, v0
-; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, v4
-; GFX1164-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1164-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1164-FAKE16-NEXT:    v_and_or_b32 v0, v1, s13, v0
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1164-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX1164-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1164-FAKE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-FAKE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-FAKE16-NEXT:    s_cbranch_execnz .LBB13_2
 ; GFX1164-FAKE16-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX1164-FAKE16-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v0
+; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX1164-FAKE16-NEXT:  .LBB13_4: ; %Flow
 ; GFX1164-FAKE16-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -8046,7 +8045,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1164-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT:    v_mad_u16 v0, s10, v2, s2
+; GFX1164-FAKE16-NEXT:    v_mad_u16 v0, s10, v4, s2
 ; GFX1164-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
 ; GFX1164-FAKE16-NEXT:    s_endpgm
@@ -8058,11 +8057,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1132-TRUE16-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s10, 0
-; GFX1132-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v2, s6, 0
+; GFX1132-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v4, s6, 0
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s9, exec_lo
 ; GFX1132-TRUE16-NEXT:    ; implicit-def: $vgpr0_lo16
 ; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-TRUE16-NEXT:    s_cbranch_execz .LBB13_4
 ; GFX1132-TRUE16-NEXT:  ; %bb.1:
 ; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8080,27 +8079,27 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1132-TRUE16-NEXT:    s_lshl_b32 s12, s6, s2
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1132-TRUE16-NEXT:  .LBB13_2: ; %atomicrmw.start
 ; GFX1132-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1132-TRUE16-NEXT:    v_add_nc_u32_e32 v0, s12, v4
-; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_add_nc_u32_e32 v0, s12, v1
 ; GFX1132-TRUE16-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX1132-TRUE16-NEXT:    v_and_or_b32 v3, v4, s11, v0
-; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX1132-TRUE16-NEXT:    v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX1132-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_and_or_b32 v0, v1, s11, v0
+; GFX1132-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1132-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX1132-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1132-TRUE16-NEXT:    s_or_b32 s10, vcc_lo, s10
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s10
 ; GFX1132-TRUE16-NEXT:    s_cbranch_execnz .LBB13_2
 ; GFX1132-TRUE16-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX1132-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s10
-; GFX1132-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1132-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1132-TRUE16-NEXT:  .LBB13_4: ; %Flow
 ; GFX1132-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
 ; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -8109,7 +8108,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT:    v_mad_u16 v0.l, s8, v2.l, s2
+; GFX1132-TRUE16-NEXT:    v_mad_u16 v0.l, s8, v4.l, s2
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
 ; GFX1132-TRUE16-NEXT:    s_endpgm
@@ -8121,11 +8120,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1132-FAKE16-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s10, 0
-; GFX1132-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v2, s6, 0
+; GFX1132-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v4, s6, 0
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s9, exec_lo
 ; GFX1132-FAKE16-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-FAKE16-NEXT:    s_cbranch_execz .LBB13_4
 ; GFX1132-FAKE16-NEXT:  ; %bb.1:
 ; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8142,27 +8141,27 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1132-FAKE16-NEXT:    s_lshl_b32 s12, s6, s2
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1132-FAKE16-NEXT:  .LBB13_2: ; %atomicrmw.start
 ; GFX1132-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1132-FAKE16-NEXT:    v_add_nc_u32_e32 v0, s12, v4
-; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_add_nc_u32_e32 v0, s12, v1
 ; GFX1132-FAKE16-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX1132-FAKE16-NEXT:    v_and_or_b32 v3, v4, s11, v0
-; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX1132-FAKE16-NEXT:    v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX1132-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_and_or_b32 v0, v1, s11, v0
+; GFX1132-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1132-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX1132-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1132-FAKE16-NEXT:    s_or_b32 s10, vcc_lo, s10
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s10
 ; GFX1132-FAKE16-NEXT:    s_cbranch_execnz .LBB13_2
 ; GFX1132-FAKE16-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX1132-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s10
-; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1132-FAKE16-NEXT:  .LBB13_4: ; %Flow
 ; GFX1132-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
 ; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -8171,7 +8170,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT:    v_mad_u16 v0, s8, v2, s2
+; GFX1132-FAKE16-NEXT:    v_mad_u16 v0, s8, v4, s2
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
 ; GFX1132-FAKE16-NEXT:    s_endpgm
@@ -8185,9 +8184,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1264-TRUE16-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX1264-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
 ; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1264-TRUE16-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
 ; GFX1264-TRUE16-NEXT:    ; implicit-def: $vgpr0_lo16
-; GFX1264-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1264-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1264-TRUE16-NEXT:    s_cbranch_execz .LBB13_4
 ; GFX1264-TRUE16-NEXT:  ; %bb.1:
 ; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
@@ -8208,29 +8207,28 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1264-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1264-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1264-TRUE16-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1264-TRUE16-NEXT:  .LBB13_2: ; %atomicrmw.start
 ; GFX1264-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1264-TRUE16-NEXT:    v_add_nc_u32_e32 v0, s14, v4
-; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_add_nc_u32_e32 v0, s14, v1
 ; GFX1264-TRUE16-NEXT:    v_and_b32_e32 v0, s12, v0
-; GFX1264-TRUE16-NEXT:    v_and_or_b32 v3, v4, s13, v0
-; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, v4
-; GFX1264-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-TRUE16-NEXT:    v_and_or_b32 v0, v1, s13, v0
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1264-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1264-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1264-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX1264-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1264-TRUE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1264-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX1264-TRUE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1264-TRUE16-NEXT:    s_cbranch_execnz .LBB13_2
 ; GFX1264-TRUE16-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX1264-TRUE16-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1264-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v0
+; GFX1264-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX1264-TRUE16-NEXT:  .LBB13_4: ; %Flow
 ; GFX1264-TRUE16-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -8240,7 +8238,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1264-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1264-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT:    v_mad_u16 v0.l, s10, v2.l, s2
+; GFX1264-TRUE16-NEXT:    v_mad_u16 v0.l, s10, v4.l, s2
 ; GFX1264-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1264-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], null
 ; GFX1264-TRUE16-NEXT:    s_endpgm
@@ -8254,9 +8252,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1264-FAKE16-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX1264-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
 ; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1264-FAKE16-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
 ; GFX1264-FAKE16-NEXT:    ; implicit-def: $vgpr0
-; GFX1264-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1264-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1264-FAKE16-NEXT:    s_cbranch_execz .LBB13_4
 ; GFX1264-FAKE16-NEXT:  ; %bb.1:
 ; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
@@ -8275,29 +8273,28 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1264-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1264-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1264-FAKE16-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1264-FAKE16-NEXT:  .LBB13_2: ; %atomicrmw.start
 ; GFX1264-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1264-FAKE16-NEXT:    v_add_nc_u32_e32 v0, s14, v4
-; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_add_nc_u32_e32 v0, s14, v1
 ; GFX1264-FAKE16-NEXT:    v_and_b32_e32 v0, s12, v0
-; GFX1264-FAKE16-NEXT:    v_and_or_b32 v3, v4, s13, v0
-; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, v4
-; GFX1264-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-FAKE16-NEXT:    v_and_or_b32 v0, v1, s13, v0
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1264-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1264-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1264-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX1264-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1264-FAKE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1264-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX1264-FAKE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1264-FAKE16-NEXT:    s_cbranch_execnz .LBB13_2
 ; GFX1264-FAKE16-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX1264-FAKE16-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v0
+; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX1264-FAKE16-NEXT:  .LBB13_4: ; %Flow
 ; GFX1264-FAKE16-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -8307,7 +8304,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1264-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1264-FAKE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT:    v_mad_u16 v0, s10, v2, s2
+; GFX1264-FAKE16-NEXT:    v_mad_u16 v0, s10, v4, s2
 ; GFX1264-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1264-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], null
 ; GFX1264-FAKE16-NEXT:    s_endpgm
@@ -8319,11 +8316,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1232-TRUE16-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s10, 0
-; GFX1232-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v2, s6, 0
+; GFX1232-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v4, s6, 0
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s9, exec_lo
 ; GFX1232-TRUE16-NEXT:    ; implicit-def: $vgpr0_lo16
 ; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1232-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1232-TRUE16-NEXT:    s_cbranch_execz .LBB13_4
 ; GFX1232-TRUE16-NEXT:  ; %bb.1:
 ; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
@@ -8344,28 +8341,27 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1232-TRUE16-NEXT:    s_lshl_b32 s12, s6, s2
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1232-TRUE16-NEXT:  .LBB13_2: ; %atomicrmw.start
 ; GFX1232-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1232-TRUE16-NEXT:    v_add_nc_u32_e32 v0, s12, v4
-; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_add_nc_u32_e32 v0, s12, v1
 ; GFX1232-TRUE16-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX1232-TRUE16-NEXT:    v_and_or_b32 v3, v4, s11, v0
-; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT:    v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX1232-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_and_or_b32 v0, v1, s11, v0
+; GFX1232-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1232-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1232-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1232-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX1232-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1232-TRUE16-NEXT:    s_or_b32 s10, vcc_lo, s10
 ; GFX1232-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX1232-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s10
 ; GFX1232-TRUE16-NEXT:    s_cbranch_execnz .LBB13_2
 ; GFX1232-TRUE16-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX1232-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s10
-; GFX1232-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1232-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1232-TRUE16-NEXT:  .LBB13_4: ; %Flow
 ; GFX1232-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
 ; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -8375,7 +8371,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1232-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1232-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT:    v_mad_u16 v0.l, s8, v2.l, s2
+; GFX1232-TRUE16-NEXT:    v_mad_u16 v0.l, s8, v4.l, s2
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1232-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], null
 ; GFX1232-TRUE16-NEXT:    s_endpgm
@@ -8387,11 +8383,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1232-FAKE16-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s10, 0
-; GFX1232-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v2, s6, 0
+; GFX1232-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v4, s6, 0
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s9, exec_lo
 ; GFX1232-FAKE16-NEXT:    ; implicit-def: $vgpr0
 ; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1232-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1232-FAKE16-NEXT:    s_cbranch_execz .LBB13_4
 ; GFX1232-FAKE16-NEXT:  ; %bb.1:
 ; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
@@ -8411,28 +8407,27 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1232-FAKE16-NEXT:    s_lshl_b32 s12, s6, s2
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1232-FAKE16-NEXT:  .LBB13_2: ; %atomicrmw.start
 ; GFX1232-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1232-FAKE16-NEXT:    v_add_nc_u32_e32 v0, s12, v4
-; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_add_nc_u32_e32 v0, s12, v1
 ; GFX1232-FAKE16-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX1232-FAKE16-NEXT:    v_and_or_b32 v3, v4, s11, v0
-; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT:    v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX1232-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_and_or_b32 v0, v1, s11, v0
+; GFX1232-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1232-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1232-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1232-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX1232-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1232-FAKE16-NEXT:    s_or_b32 s10, vcc_lo, s10
 ; GFX1232-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX1232-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s10
 ; GFX1232-FAKE16-NEXT:    s_cbranch_execnz .LBB13_2
 ; GFX1232-FAKE16-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX1232-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s10
-; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1232-FAKE16-NEXT:  .LBB13_4: ; %Flow
 ; GFX1232-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
 ; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -8442,7 +8437,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1232-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1232-FAKE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT:    v_mad_u16 v0, s8, v2, s2
+; GFX1232-FAKE16-NEXT:    v_mad_u16 v0, s8, v4, s2
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1232-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], null
 ; GFX1232-FAKE16-NEXT:    s_endpgm
@@ -8469,28 +8464,27 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa
 ; GFX7LESS-NEXT:    s_lshl_b32 s2, s3, s10
 ; GFX7LESS-NEXT:    s_not_b32 s3, s11
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7LESS-NEXT:    s_mov_b32 s6, -1
 ; GFX7LESS-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    v_and_b32_e32 v0, s3, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v0, s2, v0
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_and_b32_e32 v0, s3, v2
-; GFX7LESS-NEXT:    v_or_b32_e32 v1, s2, v0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_lshrrev_b32_e32 v0, s10, v0
+; GFX7LESS-NEXT:    v_lshrrev_b32_e32 v0, s10, v2
 ; GFX7LESS-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
@@ -8510,27 +8504,27 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa
 ; GFX8-NEXT:    s_lshl_b32 s10, s2, s8
 ; GFX8-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s7
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    v_and_b32_e32 v0, s9, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, s10, v0
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, s9, v2
-; GFX8-NEXT:    v_or_b32_e32 v1, s10, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, v2
-; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX8-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX8-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s8, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
 ; GFX8-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -8550,27 +8544,27 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa
 ; GFX9-NEXT:    s_lshl_b32 s10, s2, s8
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_and_b32_e32 v0, s9, v1
+; GFX9-NEXT:    v_or_b32_e32 v0, s10, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, s9, v2
-; GFX9-NEXT:    v_or_b32_e32 v1, s10, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, v2
-; GFX9-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX9-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -8592,23 +8586,23 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa
 ; GFX1064-NEXT:    s_not_b32 s9, s2
 ; GFX1064-NEXT:    s_mov_b32 s6, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    v_and_or_b32 v0, v1, s9, s10
+; GFX1064-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1064-NEXT:    v_and_or_b32 v1, v2, s9, s10
-; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1064-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1064-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    v_lshrrev_b32_e32 v0, s8, v0
+; GFX1064-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    buffer_store_byte v0, off, s[0:3], 0
@@ -8632,23 +8626,23 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa
 ; GFX1032-NEXT:    s_not_b32 s3, s3
 ; GFX1032-NEXT:    s_mov_b32 s6, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    v_and_or_b32 v0, v1, s3, s8
+; GFX1032-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1032-NEXT:    v_and_or_b32 v1, v2, s3, s8
-; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1032-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1032-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v2
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1032-NEXT:    s_or_b32 s9, vcc_lo, s9
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1032-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1032-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    buffer_store_byte v0, off, s[0:3], 0
@@ -8672,26 +8666,25 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa
 ; GFX1164-NEXT:    s_not_b32 s9, s2
 ; GFX1164-NEXT:    s_mov_b32 s6, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-NEXT:    v_and_or_b32 v0, v1, s9, s10
+; GFX1164-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-NEXT:    v_and_or_b32 v1, v2, s9, s10
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1164-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1164-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    v_lshrrev_b32_e32 v0, s8, v0
+; GFX1164-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
@@ -8715,26 +8708,24 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa
 ; GFX1132-NEXT:    s_not_b32 s3, s3
 ; GFX1132-NEXT:    s_mov_b32 s6, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1132-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1132-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-NEXT:    v_and_or_b32 v1, v2, s3, s8
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1132-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1132-NEXT:    v_and_or_b32 v0, v1, s3, s8
+; GFX1132-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v2
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1132-NEXT:    s_or_b32 s9, vcc_lo, s9
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s9
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1132-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1132-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
@@ -8758,26 +8749,25 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa
 ; GFX1264-NEXT:    s_not_b32 s9, s2
 ; GFX1264-NEXT:    s_mov_b32 s6, -1
 ; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1264-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1264-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1264-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX1264-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-NEXT:    v_and_or_b32 v0, v1, s9, s10
+; GFX1264-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX1264-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1264-NEXT:    v_and_or_b32 v1, v2, s9, s10
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1264-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1264-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1264-NEXT:    s_wait_loadcnt 0x0
-; GFX1264-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX1264-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1264-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1264-NEXT:    s_wait_alu 0xfffe
 ; GFX1264-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1264-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX1264-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1264-NEXT:    v_lshrrev_b32_e32 v0, s8, v0
+; GFX1264-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
 ; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1264-NEXT:    s_mov_b32 s2, -1
 ; GFX1264-NEXT:    buffer_store_b8 v0, off, s[0:3], null
@@ -8801,26 +8791,24 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa
 ; GFX1232-NEXT:    s_not_b32 s3, s3
 ; GFX1232-NEXT:    s_mov_b32 s6, -1
 ; GFX1232-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1232-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1232-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1232-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX1232-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1232-NEXT:    v_and_or_b32 v1, v2, s3, s8
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1232-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1232-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-NEXT:    v_and_or_b32 v0, v1, s3, s8
+; GFX1232-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1232-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1232-NEXT:    s_wait_loadcnt 0x0
-; GFX1232-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v2
+; GFX1232-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1232-NEXT:    s_or_b32 s9, vcc_lo, s9
 ; GFX1232-NEXT:    s_wait_alu 0xfffe
 ; GFX1232-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s9
 ; GFX1232-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX1232-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1232-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1232-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1232-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1232-NEXT:    s_mov_b32 s2, -1
 ; GFX1232-NEXT:    buffer_store_b8 v0, off, s[0:3], null
@@ -9346,8 +9334,8 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; GFX7LESS-NEXT:    s_load_dword s10, s[4:5], 0xd
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v4, s7, v0
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
 ; GFX7LESS-NEXT:    ; implicit-def: $vgpr0
 ; GFX7LESS-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB16_4
@@ -9367,28 +9355,27 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX7LESS-NEXT:    s_not_b32 s13, s12
 ; GFX7LESS-NEXT:    s_lshl_b32 s14, s6, s11
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s15
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s15
 ; GFX7LESS-NEXT:    s_mov_b32 s6, -1
 ; GFX7LESS-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s14, v4
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s14, v1
 ; GFX7LESS-NEXT:    v_and_b32_e32 v0, s12, v0
-; GFX7LESS-NEXT:    v_and_b32_e32 v1, s13, v4
-; GFX7LESS-NEXT:    v_or_b32_e32 v3, v1, v0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_and_b32_e32 v2, s13, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX7LESS-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_bfe_u32 v0, v0, s11, 16
+; GFX7LESS-NEXT:    v_bfe_u32 v0, v2, s11, 16
 ; GFX7LESS-NEXT:  .LBB16_4: ; %Flow
 ; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9398,7 +9385,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX7LESS-NEXT:    s_and_b32 s5, s10, 0xffff
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7LESS-NEXT:    v_mad_u32_u24 v0, s5, v2, v0
+; GFX7LESS-NEXT:    v_mad_u32_u24 v0, s5, v4, v0
 ; GFX7LESS-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
@@ -9408,8 +9395,8 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX8-NEXT:    s_load_dword s10, s[4:5], 0x34
 ; GFX8-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8-NEXT:    s_cbranch_execz .LBB16_4
@@ -9428,27 +9415,27 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX8-NEXT:    s_lshl_b32 s14, s2, s11
 ; GFX8-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s7
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT:    v_mov_b32_e32 v4, v0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s14, v4
-; GFX8-NEXT:    v_and_b32_e32 v1, s13, v4
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s14, v1
+; GFX8-NEXT:    v_and_b32_e32 v2, s13, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, s12, v0
-; GFX8-NEXT:    v_or_b32_e32 v3, v1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v3
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX8-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX8-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s11, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX8-NEXT:  .LBB16_4: ; %Flow
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -9457,7 +9444,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    v_mad_u16 v0, s10, v2, v0
+; GFX8-NEXT:    v_mad_u16 v0, s10, v4, v0
 ; GFX8-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -9467,8 +9454,8 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX9-NEXT:    s_load_dword s10, s[4:5], 0x34
 ; GFX9-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB16_4
@@ -9487,26 +9474,26 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX9-NEXT:    s_lshl_b32 s14, s2, s11
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-NEXT:    v_add_u32_e32 v0, s14, v4
+; GFX9-NEXT:    v_add_u32_e32 v0, s14, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, s12, v0
-; GFX9-NEXT:    v_and_or_b32 v3, v4, s13, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v3
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s13, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX9-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s11, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX9-NEXT:  .LBB16_4: ; %Flow
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -9515,7 +9502,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_mad_legacy_u16 v0, s10, v2, v0
+; GFX9-NEXT:    v_mad_legacy_u16 v0, s10, v4, v0
 ; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -9526,9 +9513,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1064-NEXT:    s_load_dword s10, s[4:5], 0x34
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB16_4
 ; GFX1064-NEXT:  ; %bb.1:
@@ -9547,32 +9534,32 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s6, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s14, v4
+; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s14, v1
 ; GFX1064-NEXT:    v_and_b32_e32 v0, s12, v0
-; GFX1064-NEXT:    v_and_or_b32 v3, v4, s13, v0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v4
-; GFX1064-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1064-NEXT:    v_and_or_b32 v0, v1, s13, v0
+; GFX1064-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1064-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1064-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    v_lshrrev_b32_e32 v0, s11, v0
+; GFX1064-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX1064-NEXT:  .LBB16_4: ; %Flow
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1064-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT:    v_mad_u16 v0, s10, v2, s2
+; GFX1064-NEXT:    v_mad_u16 v0, s10, v4, s2
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
@@ -9584,9 +9571,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1032-NEXT:    s_load_dword s8, s[4:5], 0x34
 ; GFX1032-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s10, 0
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s6, 0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v4, s6, 0
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
 ; GFX1032-NEXT:    s_and_saveexec_b32 s9, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB16_4
 ; GFX1032-NEXT:  ; %bb.1:
@@ -9604,32 +9591,32 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1032-NEXT:    s_lshl_b32 s12, s6, s2
 ; GFX1032-NEXT:    s_mov_b32 s6, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s12, v4
+; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s12, v1
 ; GFX1032-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX1032-NEXT:    v_and_or_b32 v3, v4, s11, v0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v4
-; GFX1032-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1032-NEXT:    v_and_or_b32 v0, v1, s11, v0
+; GFX1032-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1032-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1032-NEXT:    s_or_b32 s10, vcc_lo, s10
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s10
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1032-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s10
-; GFX1032-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1032-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1032-NEXT:  .LBB16_4: ; %Flow
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
 ; GFX1032-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT:    v_mad_u16 v0, s8, v2, s2
+; GFX1032-NEXT:    v_mad_u16 v0, s8, v4, s2
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
@@ -9643,9 +9630,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1164-TRUE16-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX1164-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
 ; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1164-TRUE16-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
 ; GFX1164-TRUE16-NEXT:    ; implicit-def: $vgpr0_lo16
-; GFX1164-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1164-TRUE16-NEXT:    s_cbranch_execz .LBB16_4
 ; GFX1164-TRUE16-NEXT:  ; %bb.1:
 ; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9664,29 +9651,29 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1164-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1164-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1164-TRUE16-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-TRUE16-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1164-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1164-TRUE16-NEXT:    v_add_nc_u32_e32 v0, s14, v4
-; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_add_nc_u32_e32 v0, s14, v1
 ; GFX1164-TRUE16-NEXT:    v_and_b32_e32 v0, s12, v0
-; GFX1164-TRUE16-NEXT:    v_and_or_b32 v3, v4, s13, v0
-; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, v4
-; GFX1164-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1164-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1164-TRUE16-NEXT:    v_and_or_b32 v0, v1, s13, v0
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1164-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX1164-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1164-TRUE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-TRUE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-TRUE16-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1164-TRUE16-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX1164-TRUE16-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1164-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v0
+; GFX1164-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX1164-TRUE16-NEXT:  .LBB16_4: ; %Flow
 ; GFX1164-TRUE16-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -9695,7 +9682,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1164-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT:    v_mad_u16 v0.l, s10, v2.l, s2
+; GFX1164-TRUE16-NEXT:    v_mad_u16 v0.l, s10, v4.l, s2
 ; GFX1164-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
 ; GFX1164-TRUE16-NEXT:    s_endpgm
@@ -9709,9 +9696,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1164-FAKE16-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX1164-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
 ; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1164-FAKE16-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
 ; GFX1164-FAKE16-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1164-FAKE16-NEXT:    s_cbranch_execz .LBB16_4
 ; GFX1164-FAKE16-NEXT:  ; %bb.1:
 ; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9729,29 +9716,29 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1164-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1164-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1164-FAKE16-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-FAKE16-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1164-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1164-FAKE16-NEXT:    v_add_nc_u32_e32 v0, s14, v4
-; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_add_nc_u32_e32 v0, s14, v1
 ; GFX1164-FAKE16-NEXT:    v_and_b32_e32 v0, s12, v0
-; GFX1164-FAKE16-NEXT:    v_and_or_b32 v3, v4, s13, v0
-; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, v4
-; GFX1164-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1164-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1164-FAKE16-NEXT:    v_and_or_b32 v0, v1, s13, v0
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1164-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX1164-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1164-FAKE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-FAKE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-FAKE16-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1164-FAKE16-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX1164-FAKE16-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v0
+; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX1164-FAKE16-NEXT:  .LBB16_4: ; %Flow
 ; GFX1164-FAKE16-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -9760,7 +9747,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1164-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT:    v_mad_u16 v0, s10, v2, s2
+; GFX1164-FAKE16-NEXT:    v_mad_u16 v0, s10, v4, s2
 ; GFX1164-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
 ; GFX1164-FAKE16-NEXT:    s_endpgm
@@ -9772,11 +9759,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1132-TRUE16-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s10, 0
-; GFX1132-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v2, s6, 0
+; GFX1132-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v4, s6, 0
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s9, exec_lo
 ; GFX1132-TRUE16-NEXT:    ; implicit-def: $vgpr0_lo16
 ; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-TRUE16-NEXT:    s_cbranch_execz .LBB16_4
 ; GFX1132-TRUE16-NEXT:  ; %bb.1:
 ; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9794,27 +9781,27 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1132-TRUE16-NEXT:    s_lshl_b32 s12, s6, s2
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1132-TRUE16-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1132-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1132-TRUE16-NEXT:    v_add_nc_u32_e32 v0, s12, v4
-; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_add_nc_u32_e32 v0, s12, v1
 ; GFX1132-TRUE16-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX1132-TRUE16-NEXT:    v_and_or_b32 v3, v4, s11, v0
-; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX1132-TRUE16-NEXT:    v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX1132-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_and_or_b32 v0, v1, s11, v0
+; GFX1132-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1132-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX1132-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1132-TRUE16-NEXT:    s_or_b32 s10, vcc_lo, s10
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s10
 ; GFX1132-TRUE16-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1132-TRUE16-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX1132-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s10
-; GFX1132-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1132-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1132-TRUE16-NEXT:  .LBB16_4: ; %Flow
 ; GFX1132-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
 ; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -9823,7 +9810,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT:    v_mad_u16 v0.l, s8, v2.l, s2
+; GFX1132-TRUE16-NEXT:    v_mad_u16 v0.l, s8, v4.l, s2
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
 ; GFX1132-TRUE16-NEXT:    s_endpgm
@@ -9835,11 +9822,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1132-FAKE16-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s10, 0
-; GFX1132-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v2, s6, 0
+; GFX1132-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v4, s6, 0
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s9, exec_lo
 ; GFX1132-FAKE16-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-FAKE16-NEXT:    s_cbranch_execz .LBB16_4
 ; GFX1132-FAKE16-NEXT:  ; %bb.1:
 ; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9856,27 +9843,27 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1132-FAKE16-NEXT:    s_lshl_b32 s12, s6, s2
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1132-FAKE16-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1132-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1132-FAKE16-NEXT:    v_add_nc_u32_e32 v0, s12, v4
-; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_add_nc_u32_e32 v0, s12, v1
 ; GFX1132-FAKE16-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX1132-FAKE16-NEXT:    v_and_or_b32 v3, v4, s11, v0
-; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX1132-FAKE16-NEXT:    v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX1132-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_and_or_b32 v0, v1, s11, v0
+; GFX1132-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1132-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX1132-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1132-FAKE16-NEXT:    s_or_b32 s10, vcc_lo, s10
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s10
 ; GFX1132-FAKE16-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1132-FAKE16-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX1132-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s10
-; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1132-FAKE16-NEXT:  .LBB16_4: ; %Flow
 ; GFX1132-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
 ; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -9885,7 +9872,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT:    v_mad_u16 v0, s8, v2, s2
+; GFX1132-FAKE16-NEXT:    v_mad_u16 v0, s8, v4, s2
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
 ; GFX1132-FAKE16-NEXT:    s_endpgm
@@ -9899,9 +9886,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1264-TRUE16-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX1264-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
 ; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1264-TRUE16-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
 ; GFX1264-TRUE16-NEXT:    ; implicit-def: $vgpr0_lo16
-; GFX1264-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1264-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1264-TRUE16-NEXT:    s_cbranch_execz .LBB16_4
 ; GFX1264-TRUE16-NEXT:  ; %bb.1:
 ; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
@@ -9922,29 +9909,28 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1264-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1264-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1264-TRUE16-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1264-TRUE16-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1264-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1264-TRUE16-NEXT:    v_add_nc_u32_e32 v0, s14, v4
-; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_add_nc_u32_e32 v0, s14, v1
 ; GFX1264-TRUE16-NEXT:    v_and_b32_e32 v0, s12, v0
-; GFX1264-TRUE16-NEXT:    v_and_or_b32 v3, v4, s13, v0
-; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, v4
-; GFX1264-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-TRUE16-NEXT:    v_and_or_b32 v0, v1, s13, v0
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1264-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1264-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1264-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX1264-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1264-TRUE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1264-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX1264-TRUE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1264-TRUE16-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1264-TRUE16-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX1264-TRUE16-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1264-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v0
+; GFX1264-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX1264-TRUE16-NEXT:  .LBB16_4: ; %Flow
 ; GFX1264-TRUE16-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -9954,7 +9940,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1264-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1264-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT:    v_mad_u16 v0.l, s10, v2.l, s2
+; GFX1264-TRUE16-NEXT:    v_mad_u16 v0.l, s10, v4.l, s2
 ; GFX1264-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1264-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
 ; GFX1264-TRUE16-NEXT:    s_endpgm
@@ -9968,9 +9954,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1264-FAKE16-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX1264-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
 ; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1264-FAKE16-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
 ; GFX1264-FAKE16-NEXT:    ; implicit-def: $vgpr0
-; GFX1264-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1264-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1264-FAKE16-NEXT:    s_cbranch_execz .LBB16_4
 ; GFX1264-FAKE16-NEXT:  ; %bb.1:
 ; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
@@ -9989,29 +9975,28 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1264-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1264-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1264-FAKE16-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1264-FAKE16-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1264-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1264-FAKE16-NEXT:    v_add_nc_u32_e32 v0, s14, v4
-; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_add_nc_u32_e32 v0, s14, v1
 ; GFX1264-FAKE16-NEXT:    v_and_b32_e32 v0, s12, v0
-; GFX1264-FAKE16-NEXT:    v_and_or_b32 v3, v4, s13, v0
-; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, v4
-; GFX1264-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-FAKE16-NEXT:    v_and_or_b32 v0, v1, s13, v0
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1264-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1264-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1264-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX1264-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1264-FAKE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1264-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX1264-FAKE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1264-FAKE16-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1264-FAKE16-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX1264-FAKE16-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v0
+; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
 ; GFX1264-FAKE16-NEXT:  .LBB16_4: ; %Flow
 ; GFX1264-FAKE16-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -10021,7 +10006,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1264-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1264-FAKE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT:    v_mad_u16 v0, s10, v2, s2
+; GFX1264-FAKE16-NEXT:    v_mad_u16 v0, s10, v4, s2
 ; GFX1264-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1264-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
 ; GFX1264-FAKE16-NEXT:    s_endpgm
@@ -10033,11 +10018,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1232-TRUE16-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s10, 0
-; GFX1232-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v2, s6, 0
+; GFX1232-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v4, s6, 0
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s9, exec_lo
 ; GFX1232-TRUE16-NEXT:    ; implicit-def: $vgpr0_lo16
 ; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1232-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1232-TRUE16-NEXT:    s_cbranch_execz .LBB16_4
 ; GFX1232-TRUE16-NEXT:  ; %bb.1:
 ; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
@@ -10058,28 +10043,27 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1232-TRUE16-NEXT:    s_lshl_b32 s12, s6, s2
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1232-TRUE16-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1232-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1232-TRUE16-NEXT:    v_add_nc_u32_e32 v0, s12, v4
-; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_add_nc_u32_e32 v0, s12, v1
 ; GFX1232-TRUE16-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX1232-TRUE16-NEXT:    v_and_or_b32 v3, v4, s11, v0
-; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT:    v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX1232-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_and_or_b32 v0, v1, s11, v0
+; GFX1232-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1232-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1232-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1232-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX1232-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1232-TRUE16-NEXT:    s_or_b32 s10, vcc_lo, s10
 ; GFX1232-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX1232-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s10
 ; GFX1232-TRUE16-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1232-TRUE16-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX1232-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s10
-; GFX1232-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1232-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1232-TRUE16-NEXT:  .LBB16_4: ; %Flow
 ; GFX1232-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
 ; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -10089,7 +10073,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1232-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1232-TRUE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT:    v_mad_u16 v0.l, s8, v2.l, s2
+; GFX1232-TRUE16-NEXT:    v_mad_u16 v0.l, s8, v4.l, s2
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1232-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
 ; GFX1232-TRUE16-NEXT:    s_endpgm
@@ -10101,11 +10085,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1232-FAKE16-NEXT:    s_load_b32 s8, s[4:5], 0x34
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s10, 0
-; GFX1232-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v2, s6, 0
+; GFX1232-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v4, s6, 0
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s9, exec_lo
 ; GFX1232-FAKE16-NEXT:    ; implicit-def: $vgpr0
 ; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1232-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1232-FAKE16-NEXT:    s_cbranch_execz .LBB16_4
 ; GFX1232-FAKE16-NEXT:  ; %bb.1:
 ; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
@@ -10125,28 +10109,27 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1232-FAKE16-NEXT:    s_lshl_b32 s12, s6, s2
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1232-FAKE16-NEXT:  .LBB16_2: ; %atomicrmw.start
 ; GFX1232-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1232-FAKE16-NEXT:    v_add_nc_u32_e32 v0, s12, v4
-; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_add_nc_u32_e32 v0, s12, v1
 ; GFX1232-FAKE16-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX1232-FAKE16-NEXT:    v_and_or_b32 v3, v4, s11, v0
-; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT:    v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
-; GFX1232-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_and_or_b32 v0, v1, s11, v0
+; GFX1232-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1232-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1232-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1232-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX1232-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1232-FAKE16-NEXT:    s_or_b32 s10, vcc_lo, s10
 ; GFX1232-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX1232-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s10
 ; GFX1232-FAKE16-NEXT:    s_cbranch_execnz .LBB16_2
 ; GFX1232-FAKE16-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GFX1232-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s10
-; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1232-FAKE16-NEXT:  .LBB16_4: ; %Flow
 ; GFX1232-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
 ; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -10156,7 +10139,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1232-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1232-FAKE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT:    v_mad_u16 v0, s8, v2, s2
+; GFX1232-FAKE16-NEXT:    v_mad_u16 v0, s8, v4, s2
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1232-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
 ; GFX1232-FAKE16-NEXT:    s_endpgm
@@ -10183,28 +10166,27 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp
 ; GFX7LESS-NEXT:    s_lshl_b32 s2, s3, s10
 ; GFX7LESS-NEXT:    s_not_b32 s3, s11
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7LESS-NEXT:    s_mov_b32 s6, -1
 ; GFX7LESS-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    v_and_b32_e32 v0, s3, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v0, s2, v0
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_and_b32_e32 v0, s3, v2
-; GFX7LESS-NEXT:    v_or_b32_e32 v1, s2, v0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_lshrrev_b32_e32 v0, s10, v0
+; GFX7LESS-NEXT:    v_lshrrev_b32_e32 v0, s10, v2
 ; GFX7LESS-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
@@ -10224,27 +10206,27 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp
 ; GFX8-NEXT:    s_lshl_b32 s10, s2, s8
 ; GFX8-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s7
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    v_and_b32_e32 v0, s9, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, s10, v0
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, s9, v2
-; GFX8-NEXT:    v_or_b32_e32 v1, s10, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, v2
-; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX8-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX8-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s8, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
 ; GFX8-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -10264,27 +10246,27 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp
 ; GFX9-NEXT:    s_lshl_b32 s10, s2, s8
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_and_b32_e32 v0, s9, v1
+; GFX9-NEXT:    v_or_b32_e32 v0, s10, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, s9, v2
-; GFX9-NEXT:    v_or_b32_e32 v1, s10, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, v2
-; GFX9-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX9-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
 ; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -10306,23 +10288,23 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp
 ; GFX1064-NEXT:    s_not_b32 s9, s2
 ; GFX1064-NEXT:    s_mov_b32 s6, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    v_and_or_b32 v0, v1, s9, s10
+; GFX1064-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1064-NEXT:    v_and_or_b32 v1, v2, s9, s10
-; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1064-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1064-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    v_lshrrev_b32_e32 v0, s8, v0
+; GFX1064-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -10346,23 +10328,23 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp
 ; GFX1032-NEXT:    s_not_b32 s3, s3
 ; GFX1032-NEXT:    s_mov_b32 s6, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    v_and_or_b32 v0, v1, s3, s8
+; GFX1032-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1032-NEXT:    v_and_or_b32 v1, v2, s3, s8
-; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1032-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1032-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v2
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1032-NEXT:    s_or_b32 s9, vcc_lo, s9
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1032-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1032-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -10386,26 +10368,25 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp
 ; GFX1164-NEXT:    s_not_b32 s9, s2
 ; GFX1164-NEXT:    s_mov_b32 s6, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-NEXT:    v_and_or_b32 v0, v1, s9, s10
+; GFX1164-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-NEXT:    v_and_or_b32 v1, v2, s9, s10
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1164-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1164-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    v_lshrrev_b32_e32 v0, s8, v0
+; GFX1164-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
@@ -10429,26 +10410,24 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp
 ; GFX1132-NEXT:    s_not_b32 s3, s3
 ; GFX1132-NEXT:    s_mov_b32 s6, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1132-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1132-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-NEXT:    v_and_or_b32 v1, v2, s3, s8
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1132-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1132-NEXT:    v_and_or_b32 v0, v1, s3, s8
+; GFX1132-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v2
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1132-NEXT:    s_or_b32 s9, vcc_lo, s9
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s9
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1132-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1132-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
@@ -10472,26 +10451,25 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp
 ; GFX1264-NEXT:    s_not_b32 s9, s2
 ; GFX1264-NEXT:    s_mov_b32 s6, -1
 ; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1264-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1264-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1264-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX1264-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-NEXT:    v_and_or_b32 v0, v1, s9, s10
+; GFX1264-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX1264-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1264-NEXT:    v_and_or_b32 v1, v2, s9, s10
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1264-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1264-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1264-NEXT:    s_wait_loadcnt 0x0
-; GFX1264-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX1264-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1264-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1264-NEXT:    s_wait_alu 0xfffe
 ; GFX1264-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1264-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX1264-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1264-NEXT:    v_lshrrev_b32_e32 v0, s8, v0
+; GFX1264-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
 ; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1264-NEXT:    s_mov_b32 s2, -1
 ; GFX1264-NEXT:    buffer_store_b16 v0, off, s[0:3], null
@@ -10515,26 +10493,24 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp
 ; GFX1232-NEXT:    s_not_b32 s3, s3
 ; GFX1232-NEXT:    s_mov_b32 s6, -1
 ; GFX1232-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1232-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1232-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1232-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX1232-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1232-NEXT:    v_and_or_b32 v1, v2, s3, s8
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1232-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1232-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-NEXT:    v_and_or_b32 v0, v1, s3, s8
+; GFX1232-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1232-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1232-NEXT:    s_wait_loadcnt 0x0
-; GFX1232-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v2
+; GFX1232-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1232-NEXT:    s_or_b32 s9, vcc_lo, s9
 ; GFX1232-NEXT:    s_wait_alu 0xfffe
 ; GFX1232-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s9
 ; GFX1232-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX1232-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1232-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1232-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1232-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1232-NEXT:    s_mov_b32 s2, -1
 ; GFX1232-NEXT:    buffer_store_b16 v0, off, s[0:3], null
@@ -10560,36 +10536,35 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
 ; GFX7LESS-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX7LESS-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7LESS-NEXT:    s_lshl_b32 s2, 0xffff, s10
-; GFX7LESS-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX7LESS-NEXT:    v_cvt_f32_f16_e32 v4, v0
 ; GFX7LESS-NEXT:    s_not_b32 s2, s2
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s3
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX7LESS-NEXT:    s_mov_b32 s6, -1
 ; GFX7LESS-NEXT:  .LBB18_1: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_lshrrev_b32_e32 v0, s10, v4
+; GFX7LESS-NEXT:    v_lshrrev_b32_e32 v0, s10, v1
 ; GFX7LESS-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7LESS-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX7LESS-NEXT:    v_add_f32_e32 v0, v0, v4
 ; GFX7LESS-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v0, s10, v0
-; GFX7LESS-NEXT:    v_and_b32_e32 v1, s2, v4
-; GFX7LESS-NEXT:    v_or_b32_e32 v3, v1, v0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_and_b32_e32 v2, s2, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_lshrrev_b32_e32 v0, s10, v0
+; GFX7LESS-NEXT:    v_lshrrev_b32_e32 v0, s10, v2
 ; GFX7LESS-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
@@ -10609,28 +10584,28 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
 ; GFX8-NEXT:    s_lshl_b32 s2, 0xffff, s10
 ; GFX8-NEXT:    s_not_b32 s2, s2
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s3
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:  .LBB18_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT:    v_mov_b32_e32 v2, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s10, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s10, v1
 ; GFX8-NEXT:    v_add_f16_e32 v0, s11, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, s2, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, s2, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, s10, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, v2
-; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX8-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s10, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s10, v2
 ; GFX8-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -10650,27 +10625,27 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
 ; GFX9-NEXT:    s_lshl_b32 s2, 0xffff, s10
 ; GFX9-NEXT:    s_not_b32 s2, s2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:  .LBB18_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s10, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s10, v1
 ; GFX9-NEXT:    v_add_f16_e32 v0, s11, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, s10, v0
-; GFX9-NEXT:    v_and_or_b32 v1, v2, s2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, v2
-; GFX9-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX9-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s10, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s10, v2
 ; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -10690,26 +10665,26 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
 ; GFX1064-NEXT:    s_lshl_b32 s2, 0xffff, s9
 ; GFX1064-NEXT:    s_not_b32 s10, s2
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-NEXT:  .LBB18_1: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1064-NEXT:    v_lshrrev_b32_e32 v0, s9, v2
+; GFX1064-NEXT:    v_lshrrev_b32_e32 v0, s9, v1
 ; GFX1064-NEXT:    v_add_f16_e32 v0, s8, v0
 ; GFX1064-NEXT:    v_lshlrev_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX1064-NEXT:    v_and_or_b32 v1, v2, s10, v0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1064-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1064-NEXT:    v_and_or_b32 v0, v1, s10, v0
+; GFX1064-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1064-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    v_lshrrev_b32_e32 v0, s9, v0
+; GFX1064-NEXT:    v_lshrrev_b32_e32 v0, s9, v2
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -10731,26 +10706,26 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
 ; GFX1032-NEXT:    s_lshl_b32 s3, 0xffff, s2
 ; GFX1032-NEXT:    s_not_b32 s3, s3
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX1032-NEXT:    s_mov_b32 s6, -1
 ; GFX1032-NEXT:  .LBB18_1: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1032-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
+; GFX1032-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
 ; GFX1032-NEXT:    v_add_f16_e32 v0, s8, v0
 ; GFX1032-NEXT:    v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX1032-NEXT:    v_and_or_b32 v1, v2, s3, v0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1032-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1032-NEXT:    v_and_or_b32 v0, v1, s3, v0
+; GFX1032-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1032-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v2
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1032-NEXT:    s_or_b32 s9, vcc_lo, s9
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1032-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1032-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -10773,32 +10748,31 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
 ; GFX1164-TRUE16-NEXT:    s_lshl_b32 s2, 0xffff, s9
 ; GFX1164-TRUE16-NEXT:    s_not_b32 s10, s2
 ; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1164-TRUE16-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-TRUE16-NEXT:  .LBB18_1: ; %atomicrmw.start
 ; GFX1164-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s9, v2
-; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s9, v1
 ; GFX1164-TRUE16-NEXT:    v_add_f16_e32 v0.l, s8, v0.l
-; GFX1164-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX1164-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s9, v0
-; GFX1164-TRUE16-NEXT:    v_and_or_b32 v1, v2, s10, v0
-; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1164-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-TRUE16-NEXT:    v_and_or_b32 v0, v1, s10, v0
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1164-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX1164-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1164-TRUE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-TRUE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-TRUE16-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX1164-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1164-TRUE16-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1164-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s9, v0
+; GFX1164-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s9, v2
 ; GFX1164-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
@@ -10821,32 +10795,31 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
 ; GFX1164-FAKE16-NEXT:    s_lshl_b32 s2, 0xffff, s9
 ; GFX1164-FAKE16-NEXT:    s_not_b32 s10, s2
 ; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1164-FAKE16-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-FAKE16-NEXT:  .LBB18_1: ; %atomicrmw.start
 ; GFX1164-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s9, v2
-; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s9, v1
 ; GFX1164-FAKE16-NEXT:    v_add_f16_e32 v0, s8, v0
-; GFX1164-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX1164-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s9, v0
-; GFX1164-FAKE16-NEXT:    v_and_or_b32 v1, v2, s10, v0
-; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1164-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-FAKE16-NEXT:    v_and_or_b32 v0, v1, s10, v0
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1164-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX1164-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1164-FAKE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-FAKE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-FAKE16-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX1164-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1164-FAKE16-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s9, v0
+; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s9, v2
 ; GFX1164-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
@@ -10869,32 +10842,30 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
 ; GFX1132-TRUE16-NEXT:    s_lshl_b32 s3, 0xffff, s2
 ; GFX1132-TRUE16-NEXT:    s_not_b32 s3, s3
 ; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1132-TRUE16-NEXT:  .LBB18_1: ; %atomicrmw.start
 ; GFX1132-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
-; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
 ; GFX1132-TRUE16-NEXT:    v_add_f16_e32 v0.l, s8, v0.l
-; GFX1132-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX1132-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
-; GFX1132-TRUE16-NEXT:    v_and_or_b32 v1, v2, s3, v0
-; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1132-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_and_or_b32 v0, v1, s3, v0
+; GFX1132-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1132-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v2
+; GFX1132-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1132-TRUE16-NEXT:    s_or_b32 s9, vcc_lo, s9
 ; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s9
 ; GFX1132-TRUE16-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX1132-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1132-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1132-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1132-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
@@ -10917,32 +10888,30 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
 ; GFX1132-FAKE16-NEXT:    s_lshl_b32 s3, 0xffff, s2
 ; GFX1132-FAKE16-NEXT:    s_not_b32 s3, s3
 ; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1132-FAKE16-NEXT:  .LBB18_1: ; %atomicrmw.start
 ; GFX1132-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
-; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
 ; GFX1132-FAKE16-NEXT:    v_add_f16_e32 v0, s8, v0
-; GFX1132-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX1132-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
-; GFX1132-FAKE16-NEXT:    v_and_or_b32 v1, v2, s3, v0
-; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1132-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_and_or_b32 v0, v1, s3, v0
+; GFX1132-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1132-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v2
+; GFX1132-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1132-FAKE16-NEXT:    s_or_b32 s9, vcc_lo, s9
 ; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s9
 ; GFX1132-FAKE16-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX1132-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1132-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
@@ -10965,32 +10934,31 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
 ; GFX1264-TRUE16-NEXT:    s_lshl_b32 s2, 0xffff, s9
 ; GFX1264-TRUE16-NEXT:    s_not_b32 s10, s2
 ; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1264-TRUE16-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1264-TRUE16-NEXT:  .LBB18_1: ; %atomicrmw.start
 ; GFX1264-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1264-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s9, v2
-; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s9, v1
 ; GFX1264-TRUE16-NEXT:    v_add_f16_e32 v0.l, s8, v0.l
-; GFX1264-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX1264-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s9, v0
-; GFX1264-TRUE16-NEXT:    v_and_or_b32 v1, v2, s10, v0
-; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1264-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-TRUE16-NEXT:    v_and_or_b32 v0, v1, s10, v0
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1264-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1264-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1264-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX1264-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1264-TRUE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1264-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX1264-TRUE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1264-TRUE16-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX1264-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1264-TRUE16-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1264-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s9, v0
+; GFX1264-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s9, v2
 ; GFX1264-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1264-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1264-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
@@ -11013,32 +10981,31 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
 ; GFX1264-FAKE16-NEXT:    s_lshl_b32 s2, 0xffff, s9
 ; GFX1264-FAKE16-NEXT:    s_not_b32 s10, s2
 ; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1264-FAKE16-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1264-FAKE16-NEXT:  .LBB18_1: ; %atomicrmw.start
 ; GFX1264-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s9, v2
-; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s9, v1
 ; GFX1264-FAKE16-NEXT:    v_add_f16_e32 v0, s8, v0
-; GFX1264-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX1264-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s9, v0
-; GFX1264-FAKE16-NEXT:    v_and_or_b32 v1, v2, s10, v0
-; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1264-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-FAKE16-NEXT:    v_and_or_b32 v0, v1, s10, v0
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1264-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1264-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1264-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX1264-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1264-FAKE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1264-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX1264-FAKE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1264-FAKE16-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX1264-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1264-FAKE16-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s9, v0
+; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s9, v2
 ; GFX1264-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1264-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1264-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
@@ -11061,32 +11028,30 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
 ; GFX1232-TRUE16-NEXT:    s_lshl_b32 s3, 0xffff, s2
 ; GFX1232-TRUE16-NEXT:    s_not_b32 s3, s3
 ; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1232-TRUE16-NEXT:  .LBB18_1: ; %atomicrmw.start
 ; GFX1232-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1232-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
-; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
 ; GFX1232-TRUE16-NEXT:    v_add_f16_e32 v0.l, s8, v0.l
-; GFX1232-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX1232-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
-; GFX1232-TRUE16-NEXT:    v_and_or_b32 v1, v2, s3, v0
-; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1232-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_and_or_b32 v0, v1, s3, v0
+; GFX1232-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1232-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1232-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1232-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v2
+; GFX1232-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1232-TRUE16-NEXT:    s_or_b32 s9, vcc_lo, s9
 ; GFX1232-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX1232-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s9
 ; GFX1232-TRUE16-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX1232-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1232-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1232-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1232-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1232-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
@@ -11109,32 +11074,30 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
 ; GFX1232-FAKE16-NEXT:    s_lshl_b32 s3, 0xffff, s2
 ; GFX1232-FAKE16-NEXT:    s_not_b32 s3, s3
 ; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1232-FAKE16-NEXT:  .LBB18_1: ; %atomicrmw.start
 ; GFX1232-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
-; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
 ; GFX1232-FAKE16-NEXT:    v_add_f16_e32 v0, s8, v0
-; GFX1232-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX1232-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
-; GFX1232-FAKE16-NEXT:    v_and_or_b32 v1, v2, s3, v0
-; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1232-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_and_or_b32 v0, v1, s3, v0
+; GFX1232-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1232-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1232-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1232-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v2
+; GFX1232-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1232-FAKE16-NEXT:    s_or_b32 s9, vcc_lo, s9
 ; GFX1232-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX1232-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s9
 ; GFX1232-FAKE16-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX1232-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1232-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1232-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
@@ -11160,36 +11123,35 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
 ; GFX7LESS-NEXT:    s_load_dword s3, s[4:5], 0x0
 ; GFX7LESS-NEXT:    v_mul_f32_e64 v0, 1.0, s6
 ; GFX7LESS-NEXT:    s_lshl_b32 s2, 0xffff, s10
-; GFX7LESS-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX7LESS-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX7LESS-NEXT:    s_not_b32 s2, s2
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s3
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX7LESS-NEXT:    s_mov_b32 s6, -1
 ; GFX7LESS-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_lshrrev_b32_e32 v0, s10, v4
+; GFX7LESS-NEXT:    v_lshrrev_b32_e32 v0, s10, v1
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7LESS-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX7LESS-NEXT:    v_add_f32_e32 v0, v0, v4
 ; GFX7LESS-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v0, s10, v0
-; GFX7LESS-NEXT:    v_and_b32_e32 v1, s2, v4
-; GFX7LESS-NEXT:    v_or_b32_e32 v3, v1, v0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_and_b32_e32 v2, s2, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_lshrrev_b32_e32 v0, s10, v0
+; GFX7LESS-NEXT:    v_lshrrev_b32_e32 v0, s10, v2
 ; GFX7LESS-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
@@ -11208,37 +11170,37 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
 ; GFX8-NEXT:    s_lshl_b32 s2, s6, 16
 ; GFX8-NEXT:    s_not_b32 s3, s3
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s7
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT:    v_mov_b32_e32 v1, s10
-; GFX8-NEXT:    v_mov_b32_e32 v2, v0
-; GFX8-NEXT:    v_lshrrev_b32_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_add_f32_e32 v0, s2, v0
-; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_add_f32_e32 v3, s2, v3
+; GFX8-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX8-NEXT:    v_and_b32_e32 v3, s3, v2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_or_b32_e32 v1, v3, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, v2
-; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v3
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX8-NEXT:    v_and_b32_e32 v2, s3, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX8-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s10, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s10, v2
 ; GFX8-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -11258,34 +11220,34 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
 ; GFX9-NEXT:    s_lshl_b32 s2, s6, 16
 ; GFX9-NEXT:    s_not_b32 s3, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s7
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-NEXT:    v_lshrrev_b32_sdwa v0, s10, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_add_f32_e32 v0, s2, v0
-; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s11
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s11
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_and_or_b32 v1, v2, s3, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, v2
-; GFX9-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX9-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s10, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, s10, v2
 ; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -11306,31 +11268,31 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
 ; GFX1064-NEXT:    s_not_b32 s9, s2
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1064-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1064-NEXT:    v_lshrrev_b32_sdwa v0, s8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX1064-NEXT:    v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX1064-NEXT:    v_add_f32_e32 v0, s10, v0
-; GFX1064-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1064-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX1064-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX1064-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX1064-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1064-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX1064-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX1064-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX1064-NEXT:    v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX1064-NEXT:    v_and_or_b32 v1, v2, s9, v0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1064-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1064-NEXT:    v_and_or_b32 v0, v1, s9, v0
+; GFX1064-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1064-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    v_lshrrev_b32_e32 v0, s8, v0
+; GFX1064-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -11353,31 +11315,31 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
 ; GFX1032-NEXT:    s_not_b32 s8, s3
 ; GFX1032-NEXT:    s_mov_b32 s3, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1032-NEXT:    v_lshrrev_b32_sdwa v0, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX1032-NEXT:    v_lshrrev_b32_sdwa v0, s2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX1032-NEXT:    v_add_f32_e32 v0, s9, v0
-; GFX1032-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1032-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX1032-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX1032-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1032-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1032-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc_lo
+; GFX1032-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX1032-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
 ; GFX1032-NEXT:    v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX1032-NEXT:    v_and_or_b32 v1, v2, s8, v0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1032-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1032-NEXT:    v_and_or_b32 v0, v1, s8, v0
+; GFX1032-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1032-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v2
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1032-NEXT:    s_or_b32 s3, vcc_lo, s3
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s3
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1032-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -11400,42 +11362,41 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
 ; GFX1164-TRUE16-NEXT:    s_not_b32 s9, s2
 ; GFX1164-TRUE16-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1164-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1164-TRUE16-NEXT:    .p2align 6
 ; GFX1164-TRUE16-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX1164-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
-; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s8, v1
 ; GFX1164-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-TRUE16-NEXT:    v_add_f32_e32 v0, s10, v0
-; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1164-TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1164-TRUE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX1164-TRUE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX1164-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX1164-TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX1164-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX1164-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
+; GFX1164-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s8, v2
 ; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
-; GFX1164-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
-; GFX1164-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
-; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s8, v1
-; GFX1164-TRUE16-NEXT:    v_and_or_b32 v1, v2, s9, v0
-; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1164-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1164-TRUE16-NEXT:    v_and_or_b32 v0, v1, s9, v0
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1164-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX1164-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1164-TRUE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-TRUE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-TRUE16-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX1164-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1164-TRUE16-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1164-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s8, v0
+; GFX1164-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
 ; GFX1164-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
@@ -11458,41 +11419,40 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
 ; GFX1164-FAKE16-NEXT:    s_not_b32 s9, s2
 ; GFX1164-FAKE16-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1164-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1164-FAKE16-NEXT:    .p2align 6
 ; GFX1164-FAKE16-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX1164-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
-; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s8, v1
 ; GFX1164-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-FAKE16-NEXT:    v_add_f32_e32 v0, s10, v0
-; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1164-FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1164-FAKE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX1164-FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX1164-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX1164-FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX1164-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
 ; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s8, v0
-; GFX1164-FAKE16-NEXT:    v_and_or_b32 v1, v2, s9, v0
-; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1164-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-FAKE16-NEXT:    v_and_or_b32 v0, v1, s9, v0
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1164-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX1164-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1164-FAKE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-FAKE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-FAKE16-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX1164-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1164-FAKE16-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s8, v0
+; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
 ; GFX1164-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
@@ -11515,42 +11475,40 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
 ; GFX1132-TRUE16-NEXT:    s_not_b32 s8, s3
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s3, 0
 ; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1132-TRUE16-NEXT:    .p2align 6
 ; GFX1132-TRUE16-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX1132-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
-; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
 ; GFX1132-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-TRUE16-NEXT:    v_add_f32_e32 v0, s9, v0
-; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1132-TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1132-TRUE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX1132-TRUE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX1132-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1132-TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc_lo
-; GFX1132-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
-; GFX1132-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX1132-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX1132-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
+; GFX1132-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s2, v2
 ; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s2, v1
-; GFX1132-TRUE16-NEXT:    v_and_or_b32 v1, v2, s8, v0
-; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1132-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1132-TRUE16-NEXT:    v_and_or_b32 v0, v1, s8, v0
+; GFX1132-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1132-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v2
+; GFX1132-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1132-TRUE16-NEXT:    s_or_b32 s3, vcc_lo, s3
 ; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s3
 ; GFX1132-TRUE16-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX1132-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1132-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1132-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
@@ -11573,41 +11531,39 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
 ; GFX1132-FAKE16-NEXT:    s_not_b32 s8, s3
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s3, 0
 ; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1132-FAKE16-NEXT:    .p2align 6
 ; GFX1132-FAKE16-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX1132-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
-; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
 ; GFX1132-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-FAKE16-NEXT:    v_add_f32_e32 v0, s9, v0
-; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1132-FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1132-FAKE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX1132-FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX1132-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1132-FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX1132-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
 ; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc_lo
 ; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
-; GFX1132-FAKE16-NEXT:    v_and_or_b32 v1, v2, s8, v0
-; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1132-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_and_or_b32 v0, v1, s8, v0
+; GFX1132-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1132-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v2
+; GFX1132-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1132-FAKE16-NEXT:    s_or_b32 s3, vcc_lo, s3
 ; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s3
 ; GFX1132-FAKE16-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX1132-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1132-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
@@ -11630,42 +11586,41 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
 ; GFX1264-TRUE16-NEXT:    s_not_b32 s9, s2
 ; GFX1264-TRUE16-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1264-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1264-TRUE16-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX1264-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1264-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
-; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s8, v1
 ; GFX1264-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1264-TRUE16-NEXT:    v_add_f32_e32 v0, s10, v0
-; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1264-TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1264-TRUE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX1264-TRUE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX1264-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX1264-TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX1264-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX1264-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX1264-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
+; GFX1264-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s8, v2
 ; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
-; GFX1264-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
-; GFX1264-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
-; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s8, v1
-; GFX1264-TRUE16-NEXT:    v_and_or_b32 v1, v2, s9, v0
-; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1264-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-TRUE16-NEXT:    v_and_or_b32 v0, v1, s9, v0
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1264-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1264-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1264-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX1264-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1264-TRUE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1264-TRUE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1264-TRUE16-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX1264-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1264-TRUE16-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1264-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s8, v0
+; GFX1264-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
 ; GFX1264-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1264-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1264-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
@@ -11688,41 +11643,40 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
 ; GFX1264-FAKE16-NEXT:    s_not_b32 s9, s2
 ; GFX1264-FAKE16-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1264-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1264-FAKE16-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX1264-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
-; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s8, v1
 ; GFX1264-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1264-FAKE16-NEXT:    v_add_f32_e32 v0, s10, v0
-; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1264-FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1264-FAKE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX1264-FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX1264-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX1264-FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX1264-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX1264-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
 ; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1264-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s8, v0
-; GFX1264-FAKE16-NEXT:    v_and_or_b32 v1, v2, s9, v0
-; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1264-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-FAKE16-NEXT:    v_and_or_b32 v0, v1, s9, v0
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1264-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1264-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1264-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX1264-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1264-FAKE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1264-FAKE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1264-FAKE16-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX1264-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1264-FAKE16-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s8, v0
+; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
 ; GFX1264-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1264-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1264-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
@@ -11745,42 +11699,40 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
 ; GFX1232-TRUE16-NEXT:    s_not_b32 s8, s3
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s3, 0
 ; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1232-TRUE16-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX1232-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1232-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
-; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
 ; GFX1232-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1232-TRUE16-NEXT:    v_add_f32_e32 v0, s9, v0
-; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1232-TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1232-TRUE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX1232-TRUE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX1232-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1232-TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX1232-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1232-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc_lo
-; GFX1232-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
-; GFX1232-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX1232-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX1232-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
+; GFX1232-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s2, v2
 ; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s2, v1
-; GFX1232-TRUE16-NEXT:    v_and_or_b32 v1, v2, s8, v0
-; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1232-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-TRUE16-NEXT:    v_and_or_b32 v0, v1, s8, v0
+; GFX1232-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1232-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1232-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1232-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v2
+; GFX1232-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1232-TRUE16-NEXT:    s_or_b32 s3, vcc_lo, s3
 ; GFX1232-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX1232-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s3
 ; GFX1232-TRUE16-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX1232-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1232-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GFX1232-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1232-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1232-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
@@ -11803,41 +11755,39 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
 ; GFX1232-FAKE16-NEXT:    s_not_b32 s8, s3
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s3, 0
 ; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1232-FAKE16-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX1232-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
-; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
 ; GFX1232-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1232-FAKE16-NEXT:    v_add_f32_e32 v0, s9, v0
-; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1232-FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1232-FAKE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX1232-FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX1232-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1232-FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX1232-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX1232-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
 ; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc_lo
 ; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1232-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
-; GFX1232-FAKE16-NEXT:    v_and_or_b32 v1, v2, s8, v0
-; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1232-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_and_or_b32 v0, v1, s8, v0
+; GFX1232-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1232-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1232-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1232-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v2
+; GFX1232-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1232-FAKE16-NEXT:    s_or_b32 s3, vcc_lo, s3
 ; GFX1232-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX1232-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s3
 ; GFX1232-FAKE16-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX1232-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX1232-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1232-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
@@ -11919,28 +11869,28 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
 ; GFX8-NEXT:    s_lshr_b32 s11, s10, 16
 ; GFX8-NEXT:    s_mov_b32 s4, s2
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s5
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    s_mov_b32 s5, s3
 ; GFX8-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX8-NEXT:    v_mov_b32_e32 v0, s11
+; GFX8-NEXT:    v_add_f16_e32 v2, s10, v1
+; GFX8-NEXT:    v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, v0
-; GFX8-NEXT:    v_add_f16_sdwa v0, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_add_f16_e32 v1, s10, v2
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, v2
-; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX8-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX8-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT:    buffer_store_dword v2, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: uniform_fadd_v2f16:
@@ -11954,25 +11904,25 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
 ; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
 ; GFX9-NEXT:    s_mov_b32 s4, s2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    s_mov_b32 s5, s3
 ; GFX9-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_pk_add_f16 v0, v1, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-NEXT:    v_pk_add_f16 v1, v2, s10
-; GFX9-NEXT:    v_mov_b32_e32 v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, v2
-; GFX9-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX9-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX9-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: uniform_fadd_v2f16:
@@ -11987,17 +11937,17 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
 ; GFX1064-NEXT:    s_load_dword s4, s[2:3], 0x0
 ; GFX1064-NEXT:    s_mov_b32 s5, s3
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX1064-NEXT:    s_mov_b32 s4, s2
 ; GFX1064-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    v_pk_add_f16 v0, v1, s10
+; GFX1064-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1064-NEXT:    v_pk_add_f16 v1, v2, s10
-; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1064-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1064-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1064-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB20_1
@@ -12005,7 +11955,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
-; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: uniform_fadd_v2f16:
@@ -12020,17 +11970,17 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
 ; GFX1032-NEXT:    s_load_dword s4, s[2:3], 0x0
 ; GFX1032-NEXT:    s_mov_b32 s5, s3
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX1032-NEXT:    s_mov_b32 s4, s2
 ; GFX1032-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    v_pk_add_f16 v0, v1, s8
+; GFX1032-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1032-NEXT:    v_pk_add_f16 v1, v2, s8
-; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1032-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1032-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v2
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1032-NEXT:    s_or_b32 s9, vcc_lo, s9
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB20_1
@@ -12038,7 +11988,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
-; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: uniform_fadd_v2f16:
@@ -12053,19 +12003,18 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
 ; GFX1164-NEXT:    s_load_b32 s4, s[2:3], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s5, s3
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX1164-NEXT:    s_mov_b32 s4, s2
 ; GFX1164-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-NEXT:    v_pk_add_f16 v0, v1, s10
+; GFX1164-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-NEXT:    v_pk_add_f16 v1, v2, s10
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1164-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1164-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1164-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[8:9]
@@ -12074,7 +12023,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
 ; GFX1164-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
-; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT:    buffer_store_b32 v2, off, s[0:3], 0
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: uniform_fadd_v2f16:
@@ -12089,19 +12038,17 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
 ; GFX1132-NEXT:    s_load_b32 s4, s[2:3], 0x0
 ; GFX1132-NEXT:    s_mov_b32 s5, s3
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1132-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX1132-NEXT:    s_mov_b32 s4, s2
 ; GFX1132-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-NEXT:    v_pk_add_f16 v1, v2, s8
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1132-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1132-NEXT:    v_pk_add_f16 v0, v1, s8
+; GFX1132-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v2
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1132-NEXT:    s_or_b32 s9, vcc_lo, s9
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s9
@@ -12110,7 +12057,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s9
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
-; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT:    buffer_store_b32 v2, off, s[0:3], 0
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX1264-LABEL: uniform_fadd_v2f16:
@@ -12125,19 +12072,18 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
 ; GFX1264-NEXT:    s_load_b32 s4, s[2:3], 0x0
 ; GFX1264-NEXT:    s_mov_b32 s5, s3
 ; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1264-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX1264-NEXT:    s_mov_b32 s4, s2
 ; GFX1264-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX1264-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-NEXT:    v_pk_add_f16 v0, v1, s10
+; GFX1264-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX1264-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1264-NEXT:    v_pk_add_f16 v1, v2, s10
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1264-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1264-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1264-NEXT:    s_wait_loadcnt 0x0
-; GFX1264-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX1264-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1264-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX1264-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1264-NEXT:    s_and_not1_b64 exec, exec, s[8:9]
@@ -12146,7 +12092,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1264-NEXT:    s_mov_b32 s2, -1
-; GFX1264-NEXT:    buffer_store_b32 v0, off, s[0:3], null
+; GFX1264-NEXT:    buffer_store_b32 v2, off, s[0:3], null
 ; GFX1264-NEXT:    s_endpgm
 ;
 ; GFX1232-LABEL: uniform_fadd_v2f16:
@@ -12161,19 +12107,17 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
 ; GFX1232-NEXT:    s_load_b32 s4, s[2:3], 0x0
 ; GFX1232-NEXT:    s_mov_b32 s5, s3
 ; GFX1232-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1232-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX1232-NEXT:    s_mov_b32 s4, s2
 ; GFX1232-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX1232-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1232-NEXT:    v_pk_add_f16 v1, v2, s8
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1232-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1232-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-NEXT:    v_pk_add_f16 v0, v1, s8
+; GFX1232-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1232-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1232-NEXT:    s_wait_loadcnt 0x0
-; GFX1232-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v2
+; GFX1232-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1232-NEXT:    s_or_b32 s9, vcc_lo, s9
 ; GFX1232-NEXT:    s_wait_alu 0xfffe
 ; GFX1232-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s9
@@ -12182,7 +12126,7 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s9
 ; GFX1232-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1232-NEXT:    s_mov_b32 s2, -1
-; GFX1232-NEXT:    buffer_store_b32 v0, off, s[0:3], null
+; GFX1232-NEXT:    buffer_store_b32 v2, off, s[0:3], null
 ; GFX1232-NEXT:    s_endpgm
   %rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, <2 x half> %val monotonic, align 4
   store <2 x half> %rmw, ptr addrspace(1) %result
@@ -12260,41 +12204,41 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX8-NEXT:    s_mov_b32 s4, s10
 ; GFX8-NEXT:    s_mov_b32 s5, s11
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT:    v_mov_b32_e32 v2, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
 ; GFX8-NEXT:    v_add_f32_e32 v0, s12, v0
-; GFX8-NEXT:    v_add_f32_e32 v1, s13, v1
+; GFX8-NEXT:    v_add_f32_e32 v2, s13, v2
 ; GFX8-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX8-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v5, v2, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v1
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v1
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v2
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX8-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v3, v4, s[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v0, 16
-; GFX8-NEXT:    v_mov_b32_e32 v0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, v2
-; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_alignbit_b32 v0, v2, v0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX8-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s10, -1
-; GFX8-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX8-NEXT:    buffer_store_dword v2, off, s[8:11], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: uniform_fadd_v2bf16:
@@ -12311,40 +12255,40 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX9-NEXT:    s_lshl_b32 s14, s0, 16
 ; GFX9-NEXT:    s_and_b32 s15, s0, 0xffff0000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_mov_b32 s4, s10
 ; GFX9-NEXT:    s_mov_b32 s5, s11
 ; GFX9-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
 ; GFX9-NEXT:    v_add_f32_e32 v0, s14, v0
-; GFX9-NEXT:    v_add_f32_e32 v1, s15, v1
+; GFX9-NEXT:    v_add_f32_e32 v2, s15, v2
 ; GFX9-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX9-NEXT:    v_bfe_u32 v5, v1, 16, 1
+; GFX9-NEXT:    v_bfe_u32 v5, v2, 16, 1
 ; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v2
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v0, s12
-; GFX9-NEXT:    v_add3_u32 v5, v5, v1, s12
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_add3_u32 v5, v5, v2, s12
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX9-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v4, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX9-NEXT:    v_perm_b32 v1, v1, v0, s13
-; GFX9-NEXT:    v_mov_b32_e32 v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, v2
-; GFX9-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s13
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s10, -1
-; GFX9-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX9-NEXT:    buffer_store_dword v2, off, s[8:11], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: uniform_fadd_v2bf16:
@@ -12362,30 +12306,30 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX1064-NEXT:    s_mov_b32 s4, s10
 ; GFX1064-NEXT:    s_mov_b32 s5, s11
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1064-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1064-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX1064-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX1064-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
 ; GFX1064-NEXT:    v_add_f32_e32 v0, s12, v0
-; GFX1064-NEXT:    v_add_f32_e32 v1, s13, v1
+; GFX1064-NEXT:    v_add_f32_e32 v2, s13, v2
 ; GFX1064-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX1064-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX1064-NEXT:    v_bfe_u32 v4, v2, 16, 1
 ; GFX1064-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX1064-NEXT:    v_or_b32_e32 v6, 0x400000, v1
-; GFX1064-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX1064-NEXT:    v_or_b32_e32 v6, 0x400000, v2
+; GFX1064-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX1064-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1064-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
+; GFX1064-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
 ; GFX1064-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v0
-; GFX1064-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc
+; GFX1064-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc
 ; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v3, v5, s[0:1]
-; GFX1064-NEXT:    v_perm_b32 v1, v1, v0, 0x7060302
-; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1064-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1064-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX1064-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1064-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB21_1
@@ -12393,7 +12337,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
-; GFX1064-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX1064-NEXT:    buffer_store_dword v2, off, s[8:11], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: uniform_fadd_v2bf16:
@@ -12410,31 +12354,31 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX1032-NEXT:    s_and_b32 s3, s0, 0xffff0000
 ; GFX1032-NEXT:    s_mov_b32 s5, s11
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX1032-NEXT:    s_mov_b32 s4, s10
 ; GFX1032-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1032-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX1032-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX1032-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
 ; GFX1032-NEXT:    v_add_f32_e32 v0, s2, v0
-; GFX1032-NEXT:    v_add_f32_e32 v1, s3, v1
+; GFX1032-NEXT:    v_add_f32_e32 v2, s3, v2
 ; GFX1032-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX1032-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX1032-NEXT:    v_bfe_u32 v4, v2, 16, 1
 ; GFX1032-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX1032-NEXT:    v_or_b32_e32 v6, 0x400000, v1
-; GFX1032-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX1032-NEXT:    v_or_b32_e32 v6, 0x400000, v2
+; GFX1032-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX1032-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1032-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
+; GFX1032-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
 ; GFX1032-NEXT:    v_cmp_u_f32_e64 s0, v0, v0
-; GFX1032-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX1032-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
 ; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v3, v5, s0
-; GFX1032-NEXT:    v_perm_b32 v1, v1, v0, 0x7060302
-; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1032-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; GFX1032-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX1032-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1032-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v2
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1032-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s1
 ; GFX1032-NEXT:    s_cbranch_execnz .LBB21_1
@@ -12442,7 +12386,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s10, -1
-; GFX1032-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX1032-NEXT:    buffer_store_dword v2, off, s[8:11], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-TRUE16-LABEL: uniform_fadd_v2bf16:
@@ -12459,40 +12403,40 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX1164-TRUE16-NEXT:    s_lshl_b32 s11, s4, 16
 ; GFX1164-TRUE16-NEXT:    s_mov_b32 s4, s2
 ; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v0, s5
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX1164-TRUE16-NEXT:    s_mov_b32 s5, s3
 ; GFX1164-TRUE16-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-TRUE16-NEXT:    .p2align 6
 ; GFX1164-TRUE16-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX1164-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX1164-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
-; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX1164-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
 ; GFX1164-TRUE16-NEXT:    v_add_f32_e32 v0, s11, v0
-; GFX1164-TRUE16-NEXT:    v_add_f32_e32 v1, s10, v1
 ; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-TRUE16-NEXT:    v_add_f32_e32 v2, s10, v2
 ; GFX1164-TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX1164-TRUE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-TRUE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
 ; GFX1164-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX1164-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX1164-TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX1164-TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
 ; GFX1164-TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1164-TRUE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
+; GFX1164-TRUE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
 ; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc
-; GFX1164-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX1164-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX1164-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
 ; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc
-; GFX1164-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v0, v1
-; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1164-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1164-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc
+; GFX1164-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v2
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1164-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX1164-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1164-TRUE16-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-TRUE16-NEXT:    s_and_not1_b64 exec, exec, s[8:9]
@@ -12502,7 +12446,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX1164-TRUE16-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1164-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-TRUE16-NEXT:    s_mov_b32 s2, -1
-; GFX1164-TRUE16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-TRUE16-NEXT:    buffer_store_b32 v2, off, s[0:3], 0
 ; GFX1164-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX1164-FAKE16-LABEL: uniform_fadd_v2bf16:
@@ -12520,37 +12464,37 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX1164-FAKE16-NEXT:    s_mov_b32 s4, s10
 ; GFX1164-FAKE16-NEXT:    s_mov_b32 s5, s11
 ; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1164-FAKE16-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1164-FAKE16-NEXT:    .p2align 6
 ; GFX1164-FAKE16-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX1164-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX1164-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
-; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX1164-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
 ; GFX1164-FAKE16-NEXT:    v_add_f32_e32 v0, s12, v0
-; GFX1164-FAKE16-NEXT:    v_add_f32_e32 v1, s13, v1
 ; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-FAKE16-NEXT:    v_add_f32_e32 v2, s13, v2
 ; GFX1164-FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX1164-FAKE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-FAKE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
 ; GFX1164-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX1164-FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
-; GFX1164-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX1164-FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
+; GFX1164-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX1164-FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1164-FAKE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
+; GFX1164-FAKE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
 ; GFX1164-FAKE16-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v0
 ; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc
+; GFX1164-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc
 ; GFX1164-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v3, v5, s[0:1]
-; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-FAKE16-NEXT:    v_perm_b32 v1, v1, v0, 0x7060302
-; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1164-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1164-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX1164-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1164-FAKE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-FAKE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -12560,7 +12504,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX1164-FAKE16-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1164-FAKE16-NEXT:    s_mov_b32 s10, -1
-; GFX1164-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
+; GFX1164-FAKE16-NEXT:    buffer_store_b32 v2, off, s[8:11], 0
 ; GFX1164-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX1132-TRUE16-LABEL: uniform_fadd_v2bf16:
@@ -12577,39 +12521,39 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX1132-TRUE16-NEXT:    s_lshl_b32 s10, s4, 16
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s4, s2
 ; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v0, s5
+; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s5, s3
 ; GFX1132-TRUE16-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-TRUE16-NEXT:    .p2align 6
 ; GFX1132-TRUE16-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX1132-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
-; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT:    v_dual_add_f32 v1, s9, v1 :: v_dual_lshlrev_b32 v0, 16, v2
-; GFX1132-TRUE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
-; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX1132-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
 ; GFX1132-TRUE16-NEXT:    v_add_f32_e32 v0, s10, v0
-; GFX1132-TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
-; GFX1132-TRUE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
-; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-TRUE16-NEXT:    v_add_f32_e32 v2, s9, v2
 ; GFX1132-TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-TRUE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
 ; GFX1132-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX1132-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1132-TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
 ; GFX1132-TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1132-TRUE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX1132-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX1132-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX1132-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX1132-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
-; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v0, v1
-; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1132-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
+; GFX1132-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v2
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1132-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v2
+; GFX1132-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1132-TRUE16-NEXT:    s_or_b32 s8, vcc_lo, s8
 ; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s8
@@ -12619,7 +12563,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX1132-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-TRUE16-NEXT:    s_mov_b32 s2, -1
-; GFX1132-TRUE16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-TRUE16-NEXT:    buffer_store_b32 v2, off, s[0:3], 0
 ; GFX1132-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX1132-FAKE16-LABEL: uniform_fadd_v2bf16:
@@ -12636,38 +12580,37 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX1132-FAKE16-NEXT:    s_and_b32 s3, s0, 0xffff0000
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s5, s11
 ; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s4, s10
 ; GFX1132-FAKE16-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX1132-FAKE16-NEXT:    .p2align 6
 ; GFX1132-FAKE16-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX1132-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
-; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT:    v_dual_add_f32 v1, s3, v1 :: v_dual_lshlrev_b32 v0, 16, v2
-; GFX1132-FAKE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
-; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX1132-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
 ; GFX1132-FAKE16-NEXT:    v_add_f32_e32 v0, s2, v0
-; GFX1132-FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
-; GFX1132-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX1132-FAKE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
-; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-FAKE16-NEXT:    v_add_f32_e32 v2, s3, v2
 ; GFX1132-FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-FAKE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
 ; GFX1132-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX1132-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v0, v0
-; GFX1132-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
+; GFX1132-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX1132-FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX1132-FAKE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX1132-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v0, v0
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
 ; GFX1132-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v3, v5, s0
 ; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-FAKE16-NEXT:    v_perm_b32 v1, v1, v0, 0x7060302
-; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1132-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc
+; GFX1132-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX1132-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1132-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v2
+; GFX1132-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1132-FAKE16-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
@@ -12677,7 +12620,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX1132-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1132-FAKE16-NEXT:    s_mov_b32 s10, -1
-; GFX1132-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
+; GFX1132-FAKE16-NEXT:    buffer_store_b32 v2, off, s[8:11], 0
 ; GFX1132-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX1264-TRUE16-LABEL: uniform_fadd_v2bf16:
@@ -12694,39 +12637,39 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX1264-TRUE16-NEXT:    s_lshl_b32 s11, s4, 16
 ; GFX1264-TRUE16-NEXT:    s_mov_b32 s4, s2
 ; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v0, s5
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX1264-TRUE16-NEXT:    s_mov_b32 s5, s3
 ; GFX1264-TRUE16-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX1264-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1264-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX1264-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
-; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX1264-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
 ; GFX1264-TRUE16-NEXT:    v_add_f32_e32 v0, s11, v0
-; GFX1264-TRUE16-NEXT:    v_add_f32_e32 v1, s10, v1
 ; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264-TRUE16-NEXT:    v_add_f32_e32 v2, s10, v2
 ; GFX1264-TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX1264-TRUE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1264-TRUE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
 ; GFX1264-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX1264-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX1264-TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX1264-TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
 ; GFX1264-TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1264-TRUE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
+; GFX1264-TRUE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
 ; GFX1264-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1264-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc
-; GFX1264-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX1264-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX1264-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
 ; GFX1264-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX1264-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc
-; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v0, v1
-; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1264-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v2
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1264-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1264-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1264-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX1264-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1264-TRUE16-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1264-TRUE16-NEXT:    s_and_not1_b64 exec, exec, s[8:9]
@@ -12735,7 +12678,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX1264-TRUE16-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1264-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1264-TRUE16-NEXT:    s_mov_b32 s2, -1
-; GFX1264-TRUE16-NEXT:    buffer_store_b32 v0, off, s[0:3], null
+; GFX1264-TRUE16-NEXT:    buffer_store_b32 v2, off, s[0:3], null
 ; GFX1264-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX1264-FAKE16-LABEL: uniform_fadd_v2bf16:
@@ -12753,37 +12696,37 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX1264-FAKE16-NEXT:    s_mov_b32 s4, s10
 ; GFX1264-FAKE16-NEXT:    s_mov_b32 s5, s11
 ; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1264-FAKE16-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX1264-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1264-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX1264-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
-; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX1264-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
 ; GFX1264-FAKE16-NEXT:    v_add_f32_e32 v0, s12, v0
-; GFX1264-FAKE16-NEXT:    v_add_f32_e32 v1, s13, v1
 ; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264-FAKE16-NEXT:    v_add_f32_e32 v2, s13, v2
 ; GFX1264-FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX1264-FAKE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1264-FAKE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
 ; GFX1264-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX1264-FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
-; GFX1264-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX1264-FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
+; GFX1264-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX1264-FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1264-FAKE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
+; GFX1264-FAKE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
 ; GFX1264-FAKE16-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v0
 ; GFX1264-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc
+; GFX1264-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc
 ; GFX1264-FAKE16-NEXT:    s_wait_alu 0xf1ff
 ; GFX1264-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v3, v5, s[0:1]
-; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-FAKE16-NEXT:    v_perm_b32 v1, v1, v0, 0x7060302
-; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1264-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1264-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1264-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1264-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX1264-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1264-FAKE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1264-FAKE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
@@ -12792,7 +12735,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX1264-FAKE16-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1264-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1264-FAKE16-NEXT:    s_mov_b32 s10, -1
-; GFX1264-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX1264-FAKE16-NEXT:    buffer_store_b32 v2, off, s[8:11], null
 ; GFX1264-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX1232-TRUE16-LABEL: uniform_fadd_v2bf16:
@@ -12809,39 +12752,38 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX1232-TRUE16-NEXT:    s_lshl_b32 s10, s4, 16
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s4, s2
 ; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v0, s5
+; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s5, s3
 ; GFX1232-TRUE16-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX1232-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1232-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
-; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT:    v_dual_add_f32 v1, s9, v1 :: v_dual_lshlrev_b32 v0, 16, v2
-; GFX1232-TRUE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
-; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1232-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX1232-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
 ; GFX1232-TRUE16-NEXT:    v_add_f32_e32 v0, s10, v0
-; GFX1232-TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
-; GFX1232-TRUE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
-; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232-TRUE16-NEXT:    v_add_f32_e32 v2, s9, v2
 ; GFX1232-TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1232-TRUE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
 ; GFX1232-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX1232-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1232-TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
 ; GFX1232-TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX1232-TRUE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
 ; GFX1232-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1232-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX1232-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX1232-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX1232-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX1232-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
 ; GFX1232-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX1232-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX1232-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
 ; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v0, v1
-; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1232-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v2
+; GFX1232-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1232-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1232-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1232-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v2
+; GFX1232-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1232-TRUE16-NEXT:    s_or_b32 s8, vcc_lo, s8
 ; GFX1232-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX1232-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s8
@@ -12850,7 +12792,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX1232-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1232-TRUE16-NEXT:    s_mov_b32 s2, -1
-; GFX1232-TRUE16-NEXT:    buffer_store_b32 v0, off, s[0:3], null
+; GFX1232-TRUE16-NEXT:    buffer_store_b32 v2, off, s[0:3], null
 ; GFX1232-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX1232-FAKE16-LABEL: uniform_fadd_v2bf16:
@@ -12867,38 +12809,37 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX1232-FAKE16-NEXT:    s_and_b32 s3, s0, 0xffff0000
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s5, s11
 ; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s4, s10
 ; GFX1232-FAKE16-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX1232-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1232-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
-; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT:    v_dual_add_f32 v1, s3, v1 :: v_dual_lshlrev_b32 v0, 16, v2
-; GFX1232-FAKE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
-; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1232-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX1232-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
 ; GFX1232-FAKE16-NEXT:    v_add_f32_e32 v0, s2, v0
-; GFX1232-FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
-; GFX1232-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX1232-FAKE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
-; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232-FAKE16-NEXT:    v_add_f32_e32 v2, s3, v2
 ; GFX1232-FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1232-FAKE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
 ; GFX1232-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX1232-FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
+; GFX1232-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX1232-FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX1232-FAKE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
 ; GFX1232-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v0, v0
 ; GFX1232-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX1232-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX1232-FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1232-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
 ; GFX1232-FAKE16-NEXT:    s_wait_alu 0xf1ff
-; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1232-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v3, v5, s0
-; GFX1232-FAKE16-NEXT:    v_perm_b32 v1, v1, v0, 0x7060302
-; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v0, v1
-; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1232-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX1232-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1232-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1232-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1232-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v2
+; GFX1232-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX1232-FAKE16-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX1232-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX1232-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
@@ -12907,7 +12848,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX1232-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1232-FAKE16-NEXT:    s_mov_b32 s10, -1
-; GFX1232-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX1232-FAKE16-NEXT:    buffer_store_b32 v2, off, s[8:11], null
 ; GFX1232-FAKE16-NEXT:    s_endpgm
   %rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, <2 x bfloat> %val monotonic, align 4
   store <2 x bfloat> %rmw, ptr addrspace(1) %result

diff  --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 509ba295ea7f7..ae90cfb631e8d 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -32,12 +32,12 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   S_CBRANCH_VCCZ %bb.2, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.1.bb103:
-  ; GFX90A-NEXT:   successors: %bb.59(0x40000000), %bb.2(0x40000000)
+  ; GFX90A-NEXT:   successors: %bb.58(0x40000000), %bb.2(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.59, implicit $vcc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.58, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.2:
   ; GFX90A-NEXT:   successors: %bb.3(0x80000000)
@@ -51,14 +51,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 0
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.3.Flow17:
-  ; GFX90A-NEXT:   successors: %bb.4(0x40000000), %bb.58(0x40000000)
+  ; GFX90A-NEXT:   successors: %bb.4(0x40000000), %bb.57(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr30 = V_AND_B32_e32 1023, $vgpr31, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $vgpr15 = V_MOV_B32_e32 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr17 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_VCCZ %bb.58, implicit $vcc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCZ %bb.57, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.4.bb15:
   ; GFX90A-NEXT:   successors: %bb.35(0x40000000), %bb.5(0x40000000)
@@ -122,12 +122,12 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr24 = V_MOV_B32_e32 0, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.7.Flow19:
-  ; GFX90A-NEXT:   successors: %bb.63(0x40000000), %bb.8(0x40000000)
+  ; GFX90A-NEXT:   successors: %bb.62(0x40000000), %bb.8(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_MOV_B64 0
   ; GFX90A-NEXT:   $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.63, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.62, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.8.Flow32:
   ; GFX90A-NEXT:   successors: %bb.9(0x40000000), %bb.10(0x40000000)
@@ -506,8 +506,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   S_BRANCH %bb.38
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.41.bb41:
-  ; GFX90A-NEXT:   successors: %bb.47(0x40000000), %bb.42(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr62_sgpr63, $sgpr66_sgpr67
+  ; GFX90A-NEXT:   successors: %bb.46(0x40000000), %bb.42(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = COPY $vcc
@@ -533,17 +533,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
   ; GFX90A-NEXT:   $sgpr42_sgpr43 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.47, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.46, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.42.Flow24:
   ; GFX90A-NEXT:   successors: %bb.40(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc
   ; GFX90A-NEXT:   renamable $vgpr59 = COPY killed renamable $vgpr18, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
@@ -556,8 +556,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   S_BRANCH %bb.40
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.43.bb55:
-  ; GFX90A-NEXT:   successors: %bb.49(0x40000000), %bb.44(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49
+  ; GFX90A-NEXT:   successors: %bb.48(0x40000000), %bb.44(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   S_BITCMP1_B32 killed renamable $sgpr17, 16, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc
@@ -565,19 +565,13 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr50_sgpr51, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.49, implicit $vcc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.48, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.44:
   ; GFX90A-NEXT:   successors: %bb.45(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr57, $vgpr56, $vgpr18, $vgpr30, $vgpr31, $vgpr60, $vgpr62, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $vgpr61, $vgpr58, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr47, $vgpr46, $vgpr2, $vgpr3, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr40, $vgpr63
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37
-  ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.45.Flow26:
-  ; GFX90A-NEXT:   successors: %bb.46(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr57, $vgpr56, $vgpr18, $vgpr30, $vgpr31, $vgpr60, $vgpr62, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr47, $vgpr46, $vgpr2, $vgpr3, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr40, $vgpr63, $vgpr58, $vgpr61
-  ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
@@ -592,8 +586,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 0
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.46.Flow26:
-  ; GFX90A-NEXT:   successors: %bb.48(0x80000000)
+  ; GFX90A-NEXT: bb.45.Flow26:
+  ; GFX90A-NEXT:   successors: %bb.47(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_XOR_B64 $exec, -1, implicit-def dead $scc
@@ -601,16 +595,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr46_sgpr47, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_BRANCH %bb.48
+  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr46_sgpr47, implicit-def dead $scc
+  ; GFX90A-NEXT:   S_BRANCH %bb.47
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.47.bb48:
-  ; GFX90A-NEXT:   successors: %bb.43(0x40000000), %bb.48(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr62_sgpr63, $sgpr50_sgpr51, $sgpr66_sgpr67, $sgpr44_sgpr45, $sgpr58_sgpr59, $sgpr56_sgpr57
+  ; GFX90A-NEXT: bb.46.bb48:
+  ; GFX90A-NEXT:   successors: %bb.43(0x40000000), %bb.47(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr64_sgpr65, $sgpr50_sgpr51, $sgpr66_sgpr67, $sgpr44_sgpr45, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = COPY $vcc
@@ -619,7 +613,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr0 = FLAT_LOAD_UBYTE killed renamable $vgpr0_vgpr1, 1024, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i51)
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37
   ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
@@ -640,56 +634,53 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   $sgpr18_sgpr19 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.43, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.48.Flow25:
+  ; GFX90A-NEXT: bb.47.Flow25:
   ; GFX90A-NEXT:   successors: %bb.42(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr48_sgpr49, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.42
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.49.bb63:
-  ; GFX90A-NEXT:   successors: %bb.51(0x40000000), %bb.50(0x40000000)
-  ; GFX90A-NEXT:   liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49
+  ; GFX90A-NEXT: bb.48.bb63:
+  ; GFX90A-NEXT:   successors: %bb.50(0x40000000), %bb.49(0x40000000)
+  ; GFX90A-NEXT:   liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 0
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.51, implicit $vcc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.50, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.50:
-  ; GFX90A-NEXT:   successors: %bb.45(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr58_sgpr59, $sgpr56_sgpr57
+  ; GFX90A-NEXT: bb.49:
+  ; GFX90A-NEXT:   successors: %bb.44(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 0
-  ; GFX90A-NEXT:   S_BRANCH %bb.45
+  ; GFX90A-NEXT:   S_BRANCH %bb.44
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.51.bb68:
-  ; GFX90A-NEXT:   successors: %bb.55(0x40000000), %bb.52(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr58_sgpr59, $sgpr56_sgpr57
+  ; GFX90A-NEXT: bb.50.bb68:
+  ; GFX90A-NEXT:   successors: %bb.54(0x40000000), %bb.51(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr30, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr50_sgpr51, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.55, implicit $vcc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.54, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.52:
-  ; GFX90A-NEXT:   successors: %bb.46(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr58_sgpr59, $sgpr56_sgpr57
+  ; GFX90A-NEXT: bb.51:
+  ; GFX90A-NEXT:   successors: %bb.45(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
@@ -701,20 +692,20 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
-  ; GFX90A-NEXT:   S_BRANCH %bb.46
+  ; GFX90A-NEXT:   S_BRANCH %bb.45
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.53.bb80:
-  ; GFX90A-NEXT:   successors: %bb.60(0x40000000), %bb.54(0x40000000)
+  ; GFX90A-NEXT: bb.52.bb80:
+  ; GFX90A-NEXT:   successors: %bb.59(0x40000000), %bb.53(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc
   ; GFX90A-NEXT:   renamable $vgpr6 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr7, dead renamable $sgpr52_sgpr53 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_SCC1 %bb.60, implicit killed $scc
+  ; GFX90A-NEXT:   S_CBRANCH_SCC1 %bb.59, implicit killed $scc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.54:
-  ; GFX90A-NEXT:   successors: %bb.62(0x80000000)
+  ; GFX90A-NEXT: bb.53:
+  ; GFX90A-NEXT:   successors: %bb.61(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 0
@@ -729,18 +720,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
-  ; GFX90A-NEXT:   S_BRANCH %bb.62
+  ; GFX90A-NEXT:   S_BRANCH %bb.61
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.55.bb73:
-  ; GFX90A-NEXT:   successors: %bb.53(0x40000000), %bb.56(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.54.bb73:
+  ; GFX90A-NEXT:   successors: %bb.52(0x40000000), %bb.55(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr6 = FLAT_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i76)
   ; GFX90A-NEXT:   renamable $vgpr4 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr5, dead renamable $sgpr58_sgpr59 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr6, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_MOV_B64 0
@@ -755,17 +745,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
   ; GFX90A-NEXT:   $sgpr60_sgpr61 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.53, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.52, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.56.Flow29:
-  ; GFX90A-NEXT:   successors: %bb.46(0x80000000)
+  ; GFX90A-NEXT: bb.55.Flow29:
+  ; GFX90A-NEXT:   successors: %bb.45(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr60_sgpr61, implicit-def $scc
-  ; GFX90A-NEXT:   S_BRANCH %bb.46
+  ; GFX90A-NEXT:   S_BRANCH %bb.45
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.57.bb90:
-  ; GFX90A-NEXT:   successors: %bb.61(0x80000000)
+  ; GFX90A-NEXT: bb.56.bb90:
+  ; GFX90A-NEXT:   successors: %bb.60(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec
@@ -783,9 +773,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec
-  ; GFX90A-NEXT:   S_BRANCH %bb.61
+  ; GFX90A-NEXT:   S_BRANCH %bb.60
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.58:
+  ; GFX90A-NEXT: bb.57:
   ; GFX90A-NEXT:   successors: %bb.7(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
@@ -820,7 +810,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 0
   ; GFX90A-NEXT:   S_BRANCH %bb.7
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.59.bb105:
+  ; GFX90A-NEXT: bb.58.bb105:
   ; GFX90A-NEXT:   successors: %bb.3(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
@@ -837,8 +827,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 -1
   ; GFX90A-NEXT:   S_BRANCH %bb.3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.60.bb85:
-  ; GFX90A-NEXT:   successors: %bb.57(0x40000000), %bb.61(0x40000000)
+  ; GFX90A-NEXT: bb.59.bb85:
+  ; GFX90A-NEXT:   successors: %bb.56(0x40000000), %bb.60(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr8 = V_OR_B32_e32 1, $vgpr6, implicit $exec
@@ -856,17 +846,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
   ; GFX90A-NEXT:   $sgpr54_sgpr55 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.57, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.56, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.61.Flow31:
-  ; GFX90A-NEXT:   successors: %bb.62(0x80000000)
+  ; GFX90A-NEXT: bb.60.Flow31:
+  ; GFX90A-NEXT:   successors: %bb.61(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr54_sgpr55, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 0
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.62.Flow30:
-  ; GFX90A-NEXT:   successors: %bb.56(0x80000000)
+  ; GFX90A-NEXT: bb.61.Flow30:
+  ; GFX90A-NEXT:   successors: %bb.55(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_XOR_B64 $exec, -1, implicit-def dead $scc
@@ -875,48 +865,48 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr52_sgpr53, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_BRANCH %bb.56
+  ; GFX90A-NEXT:   S_BRANCH %bb.55
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.63.bb140:
-  ; GFX90A-NEXT:   successors: %bb.69(0x40000000), %bb.64(0x40000000)
+  ; GFX90A-NEXT: bb.62.bb140:
+  ; GFX90A-NEXT:   successors: %bb.68(0x40000000), %bb.63(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.69, implicit $vcc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.68, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.64.Flow13:
-  ; GFX90A-NEXT:   successors: %bb.65(0x40000000), %bb.67(0x40000000)
+  ; GFX90A-NEXT: bb.63.Flow13:
+  ; GFX90A-NEXT:   successors: %bb.64(0x40000000), %bb.66(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.67, implicit $vcc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.66, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.65.bb159:
-  ; GFX90A-NEXT:   successors: %bb.68(0x40000000), %bb.66(0x40000000)
+  ; GFX90A-NEXT: bb.64.bb159:
+  ; GFX90A-NEXT:   successors: %bb.67(0x40000000), %bb.65(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr30, implicit $exec
   ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.68, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.67, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.66.Flow10:
-  ; GFX90A-NEXT:   successors: %bb.67(0x80000000)
+  ; GFX90A-NEXT: bb.65.Flow10:
+  ; GFX90A-NEXT:   successors: %bb.66(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_ANDN2_SAVEEXEC_B64 $sgpr12_sgpr13, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.67.Flow14:
+  ; GFX90A-NEXT: bb.66.Flow14:
   ; GFX90A-NEXT:   successors: %bb.8(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = COPY $exec
   ; GFX90A-NEXT:   S_BRANCH %bb.8
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.68.bb161:
-  ; GFX90A-NEXT:   successors: %bb.66(0x80000000)
+  ; GFX90A-NEXT: bb.67.bb161:
+  ; GFX90A-NEXT:   successors: %bb.65(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr21, killed $vgpr23, implicit $exec
@@ -932,10 +922,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr15, implicit $exec
   ; GFX90A-NEXT:   DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3)
-  ; GFX90A-NEXT:   S_BRANCH %bb.66
+  ; GFX90A-NEXT:   S_BRANCH %bb.65
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.69.bb174:
-  ; GFX90A-NEXT:   successors: %bb.73(0x40000000), %bb.70(0x40000000)
+  ; GFX90A-NEXT: bb.68.bb174:
+  ; GFX90A-NEXT:   successors: %bb.72(0x40000000), %bb.69(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr26 = V_OR_B32_e32 1, $vgpr24, implicit $exec
@@ -948,17 +938,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr50 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr32, killed $sgpr12_sgpr13, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.73, implicit $vcc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.72, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.70.Flow:
-  ; GFX90A-NEXT:   successors: %bb.71(0x40000000), %bb.72(0x40000000)
+  ; GFX90A-NEXT: bb.69.Flow:
+  ; GFX90A-NEXT:   successors: %bb.70(0x40000000), %bb.71(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.72, implicit $vcc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.71, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.71.bb186:
-  ; GFX90A-NEXT:   successors: %bb.72(0x80000000)
+  ; GFX90A-NEXT: bb.70.bb186:
+  ; GFX90A-NEXT:   successors: %bb.71(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr2_vgpr3 = V_LSHLREV_B64_e64 3, killed $vgpr2_vgpr3, implicit $exec
@@ -986,15 +976,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.72.Flow9:
-  ; GFX90A-NEXT:   successors: %bb.64(0x80000000)
+  ; GFX90A-NEXT: bb.71.Flow9:
+  ; GFX90A-NEXT:   successors: %bb.63(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 0
-  ; GFX90A-NEXT:   S_BRANCH %bb.64
+  ; GFX90A-NEXT:   S_BRANCH %bb.63
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.73.bb196:
-  ; GFX90A-NEXT:   successors: %bb.70(0x80000000)
+  ; GFX90A-NEXT: bb.72.bb196:
+  ; GFX90A-NEXT:   successors: %bb.69(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr10 = V_OR_B32_e32 $vgpr50, killed $vgpr16, implicit $exec
@@ -1002,7 +992,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr55 = V_MOV_B32_e32 0, implicit $exec
   ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr55, renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
   ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_MOV_B64 0
-  ; GFX90A-NEXT:   S_BRANCH %bb.70
+  ; GFX90A-NEXT:   S_BRANCH %bb.69
 bb:
   %i = tail call i32 @llvm.amdgcn.workitem.id.x()
   %i11 = icmp eq i32 %i, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index 36370361b677d..348862d4d8ced 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -248,20 +248,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
 ; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
 ; GFX10-NEXT:  .LBB1_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_f32_e32 v4, v5, v0
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_mov_b32_e32 v2, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_mov_b32_e32 v4, v1
+; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB1_1
@@ -291,22 +291,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s20
-; GFX8-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
 ; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX8-NEXT:  .LBB1_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_add_f32_e32 v4, v5, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v2, v5
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8-NEXT:    v_mov_b32_e32 v4, v1
+; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB1_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -317,22 +317,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s20
-; GFX7-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX7-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
 ; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX7-NEXT:  .LBB1_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v1
-; GFX7-NEXT:    v_add_f32_e32 v4, v5, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7-NEXT:    v_mov_b32_e32 v2, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX7-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v5, v2
+; GFX7-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB1_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -343,23 +343,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s20
-; GFX6-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX6-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
 ; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX6-NEXT:  .LBB1_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v1
-; GFX6-NEXT:    v_add_f32_e32 v4, v5, v0
+; GFX6-NEXT:    v_add_f32_e32 v1, v2, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v1, v4
-; GFX6-NEXT:    v_mov_b32_e32 v2, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX6-NEXT:    v_mov_b32_e32 v5, v2
+; GFX6-NEXT:    v_mov_b32_e32 v4, v1
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB1_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1038,20 +1038,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
 ; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
 ; GFX10-NEXT:  .LBB4_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_f32_e32 v4, v5, v0
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_mov_b32_e32 v2, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_mov_b32_e32 v4, v1
+; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB4_1
@@ -1063,21 +1063,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT:    buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
 ; GFX90A-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-NEXT:  .LBB4_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
-; GFX90A-NEXT:    v_add_f32_e32 v4, v5, v0
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_add_f32_e32 v2, v3, v0
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB4_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1088,22 +1088,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v1, s20
-; GFX908-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
 ; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX908-NEXT:  .LBB4_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v5, v1
-; GFX908-NEXT:    v_add_f32_e32 v4, v5, v0
-; GFX908-NEXT:    v_mov_b32_e32 v1, v4
-; GFX908-NEXT:    v_mov_b32_e32 v2, v5
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX908-NEXT:    v_mov_b32_e32 v5, v2
+; GFX908-NEXT:    v_mov_b32_e32 v4, v1
+; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB4_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1114,22 +1114,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s20
-; GFX8-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
 ; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX8-NEXT:  .LBB4_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_add_f32_e32 v4, v5, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v2, v5
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8-NEXT:    v_mov_b32_e32 v4, v1
+; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB4_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1140,22 +1140,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s20
-; GFX7-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX7-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
 ; GFX7-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX7-NEXT:  .LBB4_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v1
-; GFX7-NEXT:    v_add_f32_e32 v4, v5, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7-NEXT:    v_mov_b32_e32 v2, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX7-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v5, v2
+; GFX7-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB4_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1166,23 +1166,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s20
-; GFX6-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX6-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
 ; GFX6-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX6-NEXT:  .LBB4_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v1
-; GFX6-NEXT:    v_add_f32_e32 v4, v5, v0
+; GFX6-NEXT:    v_add_f32_e32 v1, v2, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v1, v4
-; GFX6-NEXT:    v_mov_b32_e32 v2, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX6-NEXT:    v_mov_b32_e32 v5, v2
+; GFX6-NEXT:    v_mov_b32_e32 v4, v1
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB4_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2137,20 +2137,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:    buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048
+; GFX12-NEXT:    buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048
 ; GFX12-NEXT:  .LBB9_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX12-NEXT:    v_add_f64_e32 v[2:3], v[4:5], v[0:1]
+; GFX12-NEXT:    v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_add_f64_e32 v[7:8], v[9:10], v[0:1]
-; GFX12-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
-; GFX12-NEXT:    v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10
-; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10]
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
 ; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2178,21 +2178,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048
+; GFX11-NEXT:    buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048
 ; GFX11-NEXT:  .LBB9_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX11-NEXT:    v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX11-NEXT:    v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f64 v[7:8], v[9:10], v[0:1]
-; GFX11-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
-; GFX11-NEXT:    v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10
-; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10]
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
 ; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -2208,23 +2208,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
 ; GFX10-NEXT:    s_add_i32 s4, s20, 0x800
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
+; GFX10-NEXT:    buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
 ; GFX10-NEXT:  .LBB9_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v10, v3
-; GFX10-NEXT:    v_mov_b32_e32 v9, v2
+; GFX10-NEXT:    v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v10, v5
+; GFX10-NEXT:    v_mov_b32_e32 v9, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_add_f64 v[7:8], v[9:10], v[0:1]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v7
-; GFX10-NEXT:    v_mov_b32_e32 v3, v8
-; GFX10-NEXT:    v_mov_b32_e32 v4, v9
-; GFX10-NEXT:    v_mov_b32_e32 v5, v10
-; GFX10-NEXT:    buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_mov_b32_e32 v8, v3
+; GFX10-NEXT:    v_mov_b32_e32 v7, v2
+; GFX10-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10]
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
+; GFX10-NEXT:    v_mov_b32_e32 v4, v7
+; GFX10-NEXT:    v_mov_b32_e32 v5, v8
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB9_1
@@ -2245,25 +2245,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v2, s20
-; GFX908-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
+; GFX908-NEXT:    buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
 ; GFX908-NEXT:    s_add_i32 s6, s20, 0x800
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_mov_b32_e32 v6, s6
 ; GFX908-NEXT:  .LBB9_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v10, v3
-; GFX908-NEXT:    v_mov_b32_e32 v9, v2
-; GFX908-NEXT:    v_add_f64 v[7:8], v[9:10], v[0:1]
-; GFX908-NEXT:    v_mov_b32_e32 v2, v7
-; GFX908-NEXT:    v_mov_b32_e32 v3, v8
-; GFX908-NEXT:    v_mov_b32_e32 v4, v9
-; GFX908-NEXT:    v_mov_b32_e32 v5, v10
-; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX908-NEXT:    v_mov_b32_e32 v10, v5
+; GFX908-NEXT:    v_mov_b32_e32 v9, v4
+; GFX908-NEXT:    v_mov_b32_e32 v8, v3
+; GFX908-NEXT:    v_mov_b32_e32 v7, v2
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10]
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v5, v8
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB9_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2274,25 +2274,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s20
-; GFX8-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
+; GFX8-NEXT:    buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
 ; GFX8-NEXT:    s_add_i32 s6, s20, 0x800
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s6
 ; GFX8-NEXT:  .LBB9_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v10, v3
-; GFX8-NEXT:    v_mov_b32_e32 v9, v2
-; GFX8-NEXT:    v_add_f64 v[7:8], v[9:10], v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v2, v7
-; GFX8-NEXT:    v_mov_b32_e32 v3, v8
-; GFX8-NEXT:    v_mov_b32_e32 v4, v9
-; GFX8-NEXT:    v_mov_b32_e32 v5, v10
-; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v10, v5
+; GFX8-NEXT:    v_mov_b32_e32 v9, v4
+; GFX8-NEXT:    v_mov_b32_e32 v8, v3
+; GFX8-NEXT:    v_mov_b32_e32 v7, v2
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10]
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v5, v8
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB9_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2303,25 +2303,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s20
-; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
+; GFX7-NEXT:    buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
 ; GFX7-NEXT:    s_add_i32 s6, s20, 0x800
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s6
 ; GFX7-NEXT:  .LBB9_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v10, v3
-; GFX7-NEXT:    v_mov_b32_e32 v9, v2
-; GFX7-NEXT:    v_add_f64 v[7:8], v[9:10], v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v2, v7
-; GFX7-NEXT:    v_mov_b32_e32 v3, v8
-; GFX7-NEXT:    v_mov_b32_e32 v4, v9
-; GFX7-NEXT:    v_mov_b32_e32 v5, v10
-; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
+; GFX7-NEXT:    v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX7-NEXT:    v_mov_b32_e32 v10, v5
+; GFX7-NEXT:    v_mov_b32_e32 v9, v4
+; GFX7-NEXT:    v_mov_b32_e32 v8, v3
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10]
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v5, v8
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB9_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2332,26 +2332,26 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s20
-; GFX6-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
+; GFX6-NEXT:    buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
 ; GFX6-NEXT:    s_add_i32 s6, s20, 0x800
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:    v_mov_b32_e32 v6, s6
 ; GFX6-NEXT:  .LBB9_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v10, v3
-; GFX6-NEXT:    v_mov_b32_e32 v9, v2
-; GFX6-NEXT:    v_add_f64 v[7:8], v[9:10], v[0:1]
+; GFX6-NEXT:    v_add_f64 v[2:3], v[4:5], v[0:1]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v2, v7
-; GFX6-NEXT:    v_mov_b32_e32 v3, v8
-; GFX6-NEXT:    v_mov_b32_e32 v4, v9
-; GFX6-NEXT:    v_mov_b32_e32 v5, v10
-; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
+; GFX6-NEXT:    v_mov_b32_e32 v10, v5
+; GFX6-NEXT:    v_mov_b32_e32 v9, v4
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10]
+; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v5, v8
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB9_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3430,41 +3430,41 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX12-TRUE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], null offen
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v2, v5, s[0:3], null offen
 ; GFX12-TRUE16-NEXT:    s_not_b32 s6, s5
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-TRUE16-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v2
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
 ; GFX12-TRUE16-NEXT:    v_add_f16_e32 v1.l, v1.l, v0.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT:    v_and_or_b32 v1, v2, s6, v1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX12-TRUE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
 ; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX12-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v3
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3479,41 +3479,41 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX12-FAKE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], null offen
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v2, v5, s[0:3], null offen
 ; GFX12-FAKE16-NEXT:    s_not_b32 s6, s5
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-FAKE16-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v2
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
 ; GFX12-FAKE16-NEXT:    v_add_f16_e32 v1, v1, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT:    v_and_or_b32 v1, v2, s6, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX12-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
 ; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX12-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v3
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3523,7 +3523,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX942-NEXT:    s_addk_i32 s16, 0x200
 ; GFX942-NEXT:    s_and_b32 s4, s16, -4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; GFX942-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen
 ; GFX942-NEXT:    s_and_b32 s4, s16, 3
 ; GFX942-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX942-NEXT:    s_lshl_b32 s4, 0xffff, s6
@@ -3532,23 +3532,23 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX942-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v5, v2
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, s6, v5
+; GFX942-NEXT:    v_lshrrev_b32_e32 v2, s6, v3
 ; GFX942-NEXT:    v_add_f16_e32 v2, v2, v0
 ; GFX942-NEXT:    v_lshlrev_b32_e32 v2, s6, v2
-; GFX942-NEXT:    v_and_or_b32 v4, v5, s7, v2
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT:    v_and_or_b32 v2, v3, s7, v2
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    buffer_wbl2 sc1
-; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
+; GFX942-NEXT:    buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_inv sc1
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX942-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX942-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, s6, v4
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -3557,40 +3557,40 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX11-TRUE16-NEXT:    s_addk_i32 s16, 0x200
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s16, -4
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT:    buffer_load_b32 v2, v5, s[0:3], 0 offen
 ; GFX11-TRUE16-NEXT:    s_not_b32 s6, s5
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-TRUE16-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v2
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
 ; GFX11-TRUE16-NEXT:    v_add_f16_e32 v1.l, v1.l, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v1, v2, s6, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX11-TRUE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX11-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v3
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -3599,40 +3599,40 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s16, 0x200
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s16, -4
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT:    buffer_load_b32 v2, v5, s[0:3], 0 offen
 ; GFX11-FAKE16-NEXT:    s_not_b32 s6, s5
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-FAKE16-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v2
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
 ; GFX11-FAKE16-NEXT:    v_add_f16_e32 v1, v1, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v1, v2, s6, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    buffer_gl1_inv
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
 ; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX11-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v3
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -3640,35 +3640,35 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s20, 0x200
 ; GFX10-NEXT:    s_and_b32 s4, s20, -4
-; GFX10-NEXT:    v_mov_b32_e32 v3, s4
+; GFX10-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX10-NEXT:    s_and_b32 s4, s20, 3
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX10-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX10-NEXT:    buffer_load_dword v2, v5, s[16:19], 0 offen
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, s4, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
 ; GFX10-NEXT:    v_add_f16_e32 v1, v1, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_mov_b32_e32 v2, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_and_or_b32 v1, v2, s6, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v2
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -3677,7 +3677,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX90A-NEXT:    s_addk_i32 s20, 0x200
 ; GFX90A-NEXT:    s_and_b32 s4, s20, -4
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s4
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v3, v1, s[16:19], 0 offen
 ; GFX90A-NEXT:    s_and_b32 s4, s20, 3
 ; GFX90A-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX90A-NEXT:    s_lshl_b32 s4, 0xffff, s6
@@ -3686,22 +3686,22 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
-; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, s6, v5
+; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, s6, v3
 ; GFX90A-NEXT:    v_add_f16_e32 v2, v2, v0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, s6, v2
-; GFX90A-NEXT:    v_and_or_b32 v4, v5, s7, v2
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_and_or_b32 v2, v3, s7, v2
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
+; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v4
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -3709,8 +3709,8 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s20, 0x200
 ; GFX908-NEXT:    s_and_b32 s4, s20, -4
-; GFX908-NEXT:    v_mov_b32_e32 v3, s4
-; GFX908-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX908-NEXT:    v_mov_b32_e32 v5, s4
+; GFX908-NEXT:    buffer_load_dword v2, v5, s[16:19], 0 offen
 ; GFX908-NEXT:    s_and_b32 s4, s20, 3
 ; GFX908-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX908-NEXT:    s_lshl_b32 s4, 0xffff, s6
@@ -3719,23 +3719,23 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v5, v1
-; GFX908-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
+; GFX908-NEXT:    v_lshrrev_b32_e32 v1, s6, v2
 ; GFX908-NEXT:    v_add_f16_e32 v1, v1, v0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX908-NEXT:    v_and_or_b32 v4, v5, s7, v1
-; GFX908-NEXT:    v_mov_b32_e32 v1, v4
-; GFX908-NEXT:    v_mov_b32_e32 v2, v5
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_and_or_b32 v1, v2, s7, v1
+; GFX908-NEXT:    v_mov_b32_e32 v4, v2
+; GFX908-NEXT:    v_mov_b32_e32 v3, v1
+; GFX908-NEXT:    buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -3743,8 +3743,8 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s20, 0x200
 ; GFX8-NEXT:    s_and_b32 s4, s20, -4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
-; GFX8-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v5, s4
+; GFX8-NEXT:    buffer_load_dword v2, v5, s[16:19], 0 offen
 ; GFX8-NEXT:    s_and_b32 s4, s20, 3
 ; GFX8-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX8-NEXT:    s_lshl_b32 s4, 0xffff, s6
@@ -3753,24 +3753,24 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX8-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, s6, v2
 ; GFX8-NEXT:    v_add_f16_e32 v1, v1, v0
-; GFX8-NEXT:    v_and_b32_e32 v2, s7, v5
+; GFX8-NEXT:    v_and_b32_e32 v3, s7, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX8-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v2, v5
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v4, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -3778,38 +3778,38 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s20, 0x200
 ; GFX7-NEXT:    s_and_b32 s4, s20, -4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s4
-; GFX7-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v4, s4
+; GFX7-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-NEXT:    s_and_b32 s4, s20, 3
 ; GFX7-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s6
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v0
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX7-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX7-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX7-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7-NEXT:    v_mov_b32_e32 v2, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s7, v1
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3818,39 +3818,39 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s20, 0x200
 ; GFX6-NEXT:    s_and_b32 s4, s20, -4
-; GFX6-NEXT:    v_mov_b32_e32 v3, s4
-; GFX6-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v4, s4
+; GFX6-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    s_and_b32 s4, s20, 3
 ; GFX6-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX6-NEXT:    s_lshl_b32 s4, 0xffff, s6
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v0
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX6-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX6-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX6-NEXT:    v_mov_b32_e32 v1, v4
-; GFX6-NEXT:    v_mov_b32_e32 v2, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX6-NEXT:    v_and_b32_e32 v2, s7, v1
+; GFX6-NEXT:    v_add_f32_e32 v0, v0, v5
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT:    v_mov_b32_e32 v3, v1
+; GFX6-NEXT:    v_mov_b32_e32 v2, v0
+; GFX6-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -3877,28 +3877,28 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
 ; GFX12-TRUE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], null offen
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v2, v3, s[0:3], null offen
 ; GFX12-TRUE16-NEXT:    s_not_b32 s6, s5
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-TRUE16-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v2
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
 ; GFX12-TRUE16-NEXT:    v_add_f16_e32 v1.l, v1.l, v0.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT:    v_and_or_b32 v1, v2, s6, v1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX12-TRUE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -3925,28 +3925,28 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
 ; GFX12-FAKE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], null offen
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v2, v3, s[0:3], null offen
 ; GFX12-FAKE16-NEXT:    s_not_b32 s6, s5
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-FAKE16-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v2
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
 ; GFX12-FAKE16-NEXT:    v_add_f16_e32 v1, v1, v0
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT:    v_and_or_b32 v1, v2, s6, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX12-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -3962,7 +3962,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
 ; GFX942-NEXT:    s_addk_i32 s16, 0x200
 ; GFX942-NEXT:    s_and_b32 s4, s16, -4
 ; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; GFX942-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen
 ; GFX942-NEXT:    s_and_b32 s4, s16, 3
 ; GFX942-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX942-NEXT:    s_lshl_b32 s4, 0xffff, s6
@@ -3971,18 +3971,18 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
 ; GFX942-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v5, v2
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, s6, v5
+; GFX942-NEXT:    v_lshrrev_b32_e32 v2, s6, v3
 ; GFX942-NEXT:    v_add_f16_e32 v2, v2, v0
 ; GFX942-NEXT:    v_lshlrev_b32_e32 v2, s6, v2
-; GFX942-NEXT:    v_and_or_b32 v4, v5, s7, v2
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT:    v_and_or_b32 v2, v3, s7, v2
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT:    buffer_wbl2 sc1
-; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
+; GFX942-NEXT:    buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_inv sc1
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX942-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4000,28 +4000,28 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT:    buffer_load_b32 v2, v3, s[0:3], 0 offen
 ; GFX11-TRUE16-NEXT:    s_not_b32 s6, s5
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-TRUE16-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v2
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
 ; GFX11-TRUE16-NEXT:    v_add_f16_e32 v1.l, v1.l, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v1, v2, s6, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX11-TRUE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -4041,28 +4041,28 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT:    buffer_load_b32 v2, v3, s[0:3], 0 offen
 ; GFX11-FAKE16-NEXT:    s_not_b32 s6, s5
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-FAKE16-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v2
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
 ; GFX11-FAKE16-NEXT:    v_add_f16_e32 v1, v1, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v1, v2, s6, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    buffer_gl1_inv
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -4080,25 +4080,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
 ; GFX10-NEXT:    s_and_b32 s4, s20, 3
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX10-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX10-NEXT:    buffer_load_dword v2, v3, s[16:19], 0 offen
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, s4, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
 ; GFX10-NEXT:    v_add_f16_e32 v1, v1, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_mov_b32_e32 v2, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_and_or_b32 v1, v2, s6, v1
+; GFX10-NEXT:    v_mov_b32_e32 v5, v2
+; GFX10-NEXT:    v_mov_b32_e32 v4, v1
+; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
@@ -4112,7 +4112,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
 ; GFX90A-NEXT:    s_addk_i32 s20, 0x200
 ; GFX90A-NEXT:    s_and_b32 s4, s20, -4
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s4
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v3, v1, s[16:19], 0 offen
 ; GFX90A-NEXT:    s_and_b32 s4, s20, 3
 ; GFX90A-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX90A-NEXT:    s_lshl_b32 s4, 0xffff, s6
@@ -4121,17 +4121,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
 ; GFX90A-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
-; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, s6, v5
+; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, s6, v3
 ; GFX90A-NEXT:    v_add_f16_e32 v2, v2, v0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, s6, v2
-; GFX90A-NEXT:    v_and_or_b32 v4, v5, s7, v2
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_and_or_b32 v2, v3, s7, v2
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4144,7 +4144,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
 ; GFX908-NEXT:    s_addk_i32 s20, 0x200
 ; GFX908-NEXT:    s_and_b32 s4, s20, -4
 ; GFX908-NEXT:    v_mov_b32_e32 v3, s4
-; GFX908-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX908-NEXT:    buffer_load_dword v2, v3, s[16:19], 0 offen
 ; GFX908-NEXT:    s_and_b32 s4, s20, 3
 ; GFX908-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX908-NEXT:    s_lshl_b32 s4, 0xffff, s6
@@ -4153,18 +4153,18 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
 ; GFX908-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v5, v1
-; GFX908-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
+; GFX908-NEXT:    v_lshrrev_b32_e32 v1, s6, v2
 ; GFX908-NEXT:    v_add_f16_e32 v1, v1, v0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX908-NEXT:    v_and_or_b32 v4, v5, s7, v1
-; GFX908-NEXT:    v_mov_b32_e32 v1, v4
-; GFX908-NEXT:    v_mov_b32_e32 v2, v5
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_and_or_b32 v1, v2, s7, v1
+; GFX908-NEXT:    v_mov_b32_e32 v5, v2
+; GFX908-NEXT:    v_mov_b32_e32 v4, v1
+; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4177,7 +4177,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
 ; GFX8-NEXT:    s_addk_i32 s20, 0x200
 ; GFX8-NEXT:    s_and_b32 s4, s20, -4
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s4
-; GFX8-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX8-NEXT:    buffer_load_dword v2, v3, s[16:19], 0 offen
 ; GFX8-NEXT:    s_and_b32 s4, s20, 3
 ; GFX8-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX8-NEXT:    s_lshl_b32 s4, 0xffff, s6
@@ -4186,19 +4186,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
 ; GFX8-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, s6, v2
 ; GFX8-NEXT:    v_add_f16_e32 v1, v1, v0
-; GFX8-NEXT:    v_and_b32_e32 v2, s7, v5
+; GFX8-NEXT:    v_and_b32_e32 v4, s7, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX8-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v2, v5
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX8-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8-NEXT:    v_mov_b32_e32 v4, v1
+; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4210,33 +4210,33 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s20, 0x200
 ; GFX7-NEXT:    s_and_b32 s4, s20, -4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s4
-; GFX7-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-NEXT:    s_and_b32 s4, s20, 3
 ; GFX7-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s6
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v4, s7, v1
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX7-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX7-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX7-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7-NEXT:    v_mov_b32_e32 v2, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v4, v0
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4248,34 +4248,34 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s20, 0x200
 ; GFX6-NEXT:    s_and_b32 s4, s20, -4
-; GFX6-NEXT:    v_mov_b32_e32 v3, s4
-; GFX6-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    s_and_b32 s4, s20, 3
 ; GFX6-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX6-NEXT:    s_lshl_b32 s4, 0xffff, s6
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX6-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX6-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX6-NEXT:    v_mov_b32_e32 v1, v4
-; GFX6-NEXT:    v_mov_b32_e32 v2, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX6-NEXT:    v_and_b32_e32 v4, s7, v1
+; GFX6-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX6-NEXT:    v_mov_b32_e32 v5, v1
+; GFX6-NEXT:    v_mov_b32_e32 v4, v0
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4299,11 +4299,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 3, v6
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v9, -4, v6
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v10, -4, v6
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e64 v7, v4, 0xffff
-; GFX12-TRUE16-NEXT:    v_not_b32_e32 v10, v7
+; GFX12-TRUE16-NEXT:    v_not_b32_e32 v11, v7
 ; GFX12-TRUE16-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-TRUE16-NEXT:    v_readfirstlane_b32 s5, v1
@@ -4317,7 +4317,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_b32 v6, v9, s[4:7], null offen
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v7, v10, s[4:7], null offen
 ; GFX12-TRUE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX12-TRUE16-NEXT:  ; %bb.2:
@@ -4327,19 +4327,17 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX12-TRUE16-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v8, v6
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, v4, v7
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, v4, v8
 ; GFX12-TRUE16-NEXT:    v_add_f16_e32 v6.l, v6.l, v5.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, v4, v6
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_or_b32 v7, v8, v10, v6
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v7
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v7, v8
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, v4, v6
+; GFX12-TRUE16-NEXT:    v_and_or_b32 v6, v7, v11, v6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
 ; GFX12-TRUE16-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX12-TRUE16-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX12-TRUE16-NEXT:    v_readfirstlane_b32 s4, v0
@@ -4354,13 +4352,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
 ; GFX12-TRUE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB15_4
 ; GFX12-TRUE16-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX12-TRUE16-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v6, v8
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v8, v7
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v7, v8
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-TRUE16-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
@@ -4368,7 +4367,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX12-TRUE16-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX12-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, v4, v6
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, v4, v8
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4383,11 +4382,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 3, v6
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v9, -4, v6
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v10, -4, v6
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e64 v7, v4, 0xffff
-; GFX12-FAKE16-NEXT:    v_not_b32_e32 v10, v7
+; GFX12-FAKE16-NEXT:    v_not_b32_e32 v11, v7
 ; GFX12-FAKE16-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-FAKE16-NEXT:    v_readfirstlane_b32 s5, v1
@@ -4401,7 +4400,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    buffer_load_b32 v6, v9, s[4:7], null offen
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v7, v10, s[4:7], null offen
 ; GFX12-FAKE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX12-FAKE16-NEXT:  ; %bb.2:
@@ -4411,19 +4410,17 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX12-FAKE16-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v8, v6
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, v4, v7
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, v4, v8
 ; GFX12-FAKE16-NEXT:    v_add_f16_e32 v6, v6, v5
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, v4, v6
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_or_b32 v7, v8, v10, v6
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v7
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v7, v8
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, v4, v6
+; GFX12-FAKE16-NEXT:    v_and_or_b32 v6, v7, v11, v6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
 ; GFX12-FAKE16-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX12-FAKE16-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX12-FAKE16-NEXT:    v_readfirstlane_b32 s4, v0
@@ -4438,13 +4435,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
 ; GFX12-FAKE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB15_4
 ; GFX12-FAKE16-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX12-FAKE16-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v6, v8
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v8, v7
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v7, v8
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-FAKE16-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
@@ -4452,7 +4450,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX12-FAKE16-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX12-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, v4, v6
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, v4, v8
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4477,7 +4475,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX942-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
 ; GFX942-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT:    buffer_load_dword v6, v10, s[4:7], 0 offen
+; GFX942-NEXT:    buffer_load_dword v7, v10, s[4:7], 0 offen
 ; GFX942-NEXT:    s_xor_b64 exec, exec, s[0:1]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX942-NEXT:  ; %bb.2:
@@ -4487,13 +4485,12 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX942-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX942-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v9, v6
-; GFX942-NEXT:    v_lshrrev_b32_e32 v6, v4, v9
+; GFX942-NEXT:    v_lshrrev_b32_e32 v6, v4, v7
 ; GFX942-NEXT:    v_add_f16_e32 v6, v6, v5
 ; GFX942-NEXT:    v_lshlrev_b32_e32 v6, v4, v6
-; GFX942-NEXT:    v_and_or_b32 v8, v9, v11, v6
+; GFX942-NEXT:    v_and_or_b32 v6, v7, v11, v6
 ; GFX942-NEXT:    s_mov_b64 s[8:9], exec
-; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[8:9]
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
 ; GFX942-NEXT:    buffer_wbl2 sc1
 ; GFX942-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX942-NEXT:    ; => This Inner Loop Header: Depth=2
@@ -4507,20 +4504,21 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX942-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0
+; GFX942-NEXT:    buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0
 ; GFX942-NEXT:    s_xor_b64 exec, exec, s[0:1]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB15_4
 ; GFX942-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX942-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v9
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v7, v8
 ; GFX942-NEXT:    buffer_inv sc1
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX942-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT:    v_lshrrev_b32_e32 v0, v4, v6
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, v4, v8
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -4531,11 +4529,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 3, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, -4, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, -4, v6
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e64 v7, v4, 0xffff
-; GFX11-TRUE16-NEXT:    v_not_b32_e32 v10, v7
+; GFX11-TRUE16-NEXT:    v_not_b32_e32 v11, v7
 ; GFX11-TRUE16-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v1
@@ -4547,7 +4545,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX11-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT:    buffer_load_b32 v6, v9, s[4:7], 0 offen
+; GFX11-TRUE16-NEXT:    buffer_load_b32 v7, v10, s[4:7], 0 offen
 ; GFX11-TRUE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX11-TRUE16-NEXT:  ; %bb.2:
@@ -4557,19 +4555,17 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX11-TRUE16-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, v4, v7
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, v4, v8
 ; GFX11-TRUE16-NEXT:    v_add_f16_e32 v6.l, v6.l, v5.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, v4, v6
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v7, v8, v10, v6
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v7
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v7, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, v4, v6
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v6, v7, v11, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
 ; GFX11-TRUE16-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX11-TRUE16-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v0
@@ -4583,13 +4579,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc
+; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc
 ; GFX11-TRUE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB15_4
 ; GFX11-TRUE16-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v6, v8
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v8, v7
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v7, v8
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
 ; GFX11-TRUE16-NEXT:    s_or_b32 s1, vcc_lo, s1
@@ -4598,7 +4595,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX11-TRUE16-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, v4, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, v4, v8
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -4609,11 +4606,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 3, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, -4, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, -4, v6
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e64 v7, v4, 0xffff
-; GFX11-FAKE16-NEXT:    v_not_b32_e32 v10, v7
+; GFX11-FAKE16-NEXT:    v_not_b32_e32 v11, v7
 ; GFX11-FAKE16-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s5, v1
@@ -4625,7 +4622,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX11-FAKE16-NEXT:    buffer_load_b32 v6, v9, s[4:7], 0 offen
+; GFX11-FAKE16-NEXT:    buffer_load_b32 v7, v10, s[4:7], 0 offen
 ; GFX11-FAKE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX11-FAKE16-NEXT:  ; %bb.2:
@@ -4635,19 +4632,17 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX11-FAKE16-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, v4, v7
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, v4, v8
 ; GFX11-FAKE16-NEXT:    v_add_f16_e32 v6, v6, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, v4, v6
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v7, v8, v10, v6
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v7
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v7, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, v4, v6
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v6, v7, v11, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
 ; GFX11-FAKE16-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX11-FAKE16-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s4, v0
@@ -4661,13 +4656,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc
+; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc
 ; GFX11-FAKE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB15_4
 ; GFX11-FAKE16-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v6, v8
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v8, v7
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v7, v8
 ; GFX11-FAKE16-NEXT:    buffer_gl1_inv
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, vcc_lo, s1
@@ -4676,7 +4672,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX11-FAKE16-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, v4, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, v4, v8
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -4686,10 +4682,10 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX10-NEXT:    v_and_b32_e32 v4, 3, v6
-; GFX10-NEXT:    v_and_b32_e32 v9, -4, v6
+; GFX10-NEXT:    v_and_b32_e32 v10, -4, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v7, v4, 0xffff
-; GFX10-NEXT:    v_not_b32_e32 v10, v7
+; GFX10-NEXT:    v_not_b32_e32 v11, v7
 ; GFX10-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4699,7 +4695,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
 ; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
-; GFX10-NEXT:    buffer_load_dword v6, v9, s[8:11], 0 offen
+; GFX10-NEXT:    buffer_load_dword v7, v10, s[8:11], 0 offen
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
@@ -4709,15 +4705,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX10-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v8, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, v4, v7
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, v4, v8
 ; GFX10-NEXT:    v_add_f16_e32 v6, v6, v5
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT:    v_and_or_b32 v7, v8, v10, v6
-; GFX10-NEXT:    v_mov_b32_e32 v6, v7
-; GFX10-NEXT:    v_mov_b32_e32 v7, v8
+; GFX10-NEXT:    v_and_or_b32 v6, v7, v11, v6
+; GFX10-NEXT:    v_mov_b32_e32 v9, v7
+; GFX10-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX10-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
@@ -4729,14 +4724,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX10-NEXT:    buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB15_4
 ; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v6, v8
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v8, v7
+; GFX10-NEXT:    v_mov_b32_e32 v7, v8
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
@@ -4745,7 +4741,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX10-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v4, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v4, v8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -4769,7 +4765,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX90A-NEXT:    s_nop 0
-; GFX90A-NEXT:    buffer_load_dword v6, v10, s[8:11], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v7, v10, s[8:11], 0 offen
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX90A-NEXT:  ; %bb.2:
@@ -4779,13 +4775,12 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX90A-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v9, v6
-; GFX90A-NEXT:    v_lshrrev_b32_e32 v6, v4, v9
+; GFX90A-NEXT:    v_lshrrev_b32_e32 v6, v4, v7
 ; GFX90A-NEXT:    v_add_f16_e32 v6, v6, v5
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v6, v4, v6
-; GFX90A-NEXT:    v_and_or_b32 v8, v9, v11, v6
+; GFX90A-NEXT:    v_and_or_b32 v6, v7, v11, v6
 ; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
-; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
 ; GFX90A-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
@@ -4797,32 +4792,33 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB15_4
 ; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v9
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v7, v8
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v4, v6
+; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v4, v8
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_u32_e32 v4, 0x200, v4
-; GFX908-NEXT:    v_and_b32_e32 v9, -4, v4
+; GFX908-NEXT:    v_and_b32_e32 v10, -4, v4
 ; GFX908-NEXT:    v_and_b32_e32 v4, 3, v4
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX908-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX908-NEXT:    v_lshlrev_b32_e64 v6, v4, s4
-; GFX908-NEXT:    v_not_b32_e32 v10, v6
+; GFX908-NEXT:    v_not_b32_e32 v11, v6
 ; GFX908-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX908-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
@@ -4834,7 +4830,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX908-NEXT:    s_nop 0
-; GFX908-NEXT:    buffer_load_dword v6, v9, s[8:11], 0 offen
+; GFX908-NEXT:    buffer_load_dword v7, v10, s[8:11], 0 offen
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX908-NEXT:  ; %bb.2:
@@ -4844,14 +4840,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX908-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v8, v6
-; GFX908-NEXT:    v_lshrrev_b32_e32 v6, v4, v8
+; GFX908-NEXT:    v_lshrrev_b32_e32 v6, v4, v7
 ; GFX908-NEXT:    v_add_f16_e32 v6, v6, v5
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v6, v4, v6
-; GFX908-NEXT:    v_and_or_b32 v7, v8, v10, v6
-; GFX908-NEXT:    v_mov_b32_e32 v6, v7
+; GFX908-NEXT:    v_and_or_b32 v6, v7, v11, v6
+; GFX908-NEXT:    v_mov_b32_e32 v9, v7
 ; GFX908-NEXT:    s_mov_b64 s[12:13], exec
-; GFX908-NEXT:    v_mov_b32_e32 v7, v8
+; GFX908-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX908-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
@@ -4863,32 +4858,33 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX908-NEXT:    buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB15_4
 ; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v8
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v7, v8
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v4, v6
+; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v4, v8
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x200, v4
-; GFX8-NEXT:    v_and_b32_e32 v9, -4, v4
+; GFX8-NEXT:    v_and_b32_e32 v10, -4, v4
 ; GFX8-NEXT:    v_and_b32_e32 v4, 3, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX8-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v6, v4, s4
-; GFX8-NEXT:    v_not_b32_e32 v10, v6
+; GFX8-NEXT:    v_not_b32_e32 v11, v6
 ; GFX8-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
@@ -4900,7 +4896,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    buffer_load_dword v6, v9, s[8:11], 0 offen
+; GFX8-NEXT:    buffer_load_dword v7, v10, s[8:11], 0 offen
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX8-NEXT:  ; %bb.2:
@@ -4910,15 +4906,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX8-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v8, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, v4, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, v4, v7
 ; GFX8-NEXT:    v_add_f16_e32 v6, v6, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, v4, v6
-; GFX8-NEXT:    v_and_b32_e32 v7, v8, v10
-; GFX8-NEXT:    v_or_b32_e32 v7, v7, v6
-; GFX8-NEXT:    v_mov_b32_e32 v6, v7
+; GFX8-NEXT:    v_and_b32_e32 v8, v7, v11
+; GFX8-NEXT:    v_or_b32_e32 v6, v8, v6
+; GFX8-NEXT:    v_mov_b32_e32 v9, v7
 ; GFX8-NEXT:    s_mov_b64 s[12:13], exec
-; GFX8-NEXT:    v_mov_b32_e32 v7, v8
+; GFX8-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX8-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
@@ -4930,20 +4925,21 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX8-NEXT:    buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB15_4
 ; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v8
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v7, v8
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v4, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v4, v8
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -4965,19 +4961,18 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
 ; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX7-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX7-NEXT:  ; %bb.2:
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v5
 ; GFX7-NEXT:    s_mov_b64 s[6:7], 0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v4
 ; GFX7-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX7-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GFX7-NEXT:    v_and_b32_e32 v5, v6, v9
@@ -5007,6 +5002,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
 ; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB15_3
@@ -5035,20 +5031,18 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
 ; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX6-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX6-NEXT:  ; %bb.2:
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v5
 ; GFX6-NEXT:    s_mov_b64 s[6:7], 0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v10, v5
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v10, v4
 ; GFX6-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX6-NEXT:    ; Child Loop BB15_4 Depth 2
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v6, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GFX6-NEXT:    v_and_b32_e32 v5, v6, v9
@@ -5078,6 +5072,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
 ; GFX6-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB15_3
@@ -5105,55 +5100,55 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
 ; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_addk_co_i32 s16, 0x200
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX12-TRUE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], null offen
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, v4, s[0:3], null offen
 ; GFX12-TRUE16-NEXT:    s_not_b32 s6, s5
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-TRUE16-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT:    v_add_f32_e32 v0, v0, v5
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-TRUE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v1.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v2
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX12-TRUE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
 ; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX12-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5165,54 +5160,54 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_addk_co_i32 s16, 0x200
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX12-FAKE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], null offen
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, v4, s[0:3], null offen
 ; GFX12-FAKE16-NEXT:    s_not_b32 s6, s5
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-FAKE16-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT:    v_add_f32_e32 v0, v0, v5
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX12-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
 ; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX12-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5221,36 +5216,36 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    s_addk_i32 s16, 0x200
 ; GFX942-NEXT:    s_and_b32 s4, s16, -4
-; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; GFX942-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen
 ; GFX942-NEXT:    s_and_b32 s4, s16, 3
 ; GFX942-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX942-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX942-NEXT:    s_not_b32 s7, s4
 ; GFX942-NEXT:    s_mov_b64 s[4:5], 0
-; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX942-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX942-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v5, v2
-; GFX942-NEXT:    v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX942-NEXT:    buffer_wbl2 sc1
-; GFX942-NEXT:    v_add_f32_e32 v2, v2, v0
-; GFX942-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX942-NEXT:    v_or_b32_e32 v4, 0x400000, v2
-; GFX942-NEXT:    v_add3_u32 v3, v3, v2, s8
-; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX942-NEXT:    v_add_f32_e32 v0, v0, v5
+; GFX942-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX942-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX942-NEXT:    v_add3_u32 v2, v2, v0, s8
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX942-NEXT:    v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT:    v_and_or_b32 v4, v5, s7, v2
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
-; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX942-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_inv sc1
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX942-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5262,147 +5257,147 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_addk_i32 s16, 0x200
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, v4, s[0:3], 0 offen
 ; GFX11-TRUE16-NEXT:    s_not_b32 s6, s5
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-TRUE16-NEXT:    .p2align 6
 ; GFX11-TRUE16-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, v0, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v1.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v2
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX11-TRUE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX11-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s16, 0x200
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v4, s[0:3], 0 offen
 ; GFX11-FAKE16-NEXT:    s_not_b32 s6, s5
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-FAKE16-NEXT:    .p2align 6
 ; GFX11-FAKE16-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, v0, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    buffer_gl1_inv
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
 ; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX11-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s20, 0x200
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX10-NEXT:    s_and_b32 s4, s20, -4
-; GFX10-NEXT:    v_mov_b32_e32 v3, s4
+; GFX10-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX10-NEXT:    s_and_b32 s4, s20, 3
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX10-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX10-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, s4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_mov_b32_e32 v2, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v5
+; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -5410,34 +5405,34 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s20, 0x200
 ; GFX90A-NEXT:    s_and_b32 s4, s20, -4
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s4
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
+; GFX90A-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX90A-NEXT:    s_and_b32 s4, s20, 3
 ; GFX90A-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX90A-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX90A-NEXT:    s_not_b32 s7, s4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
-; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT:    v_add_f32_e32 v2, v2, v0
-; GFX90A-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX90A-NEXT:    v_or_b32_e32 v4, 0x400000, v2
-; GFX90A-NEXT:    v_add3_u32 v3, v3, v2, s8
-; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX90A-NEXT:    v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT:    v_and_or_b32 v4, v5, s7, v2
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT:    v_add_f32_e32 v0, v0, v5
+; GFX90A-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX90A-NEXT:    v_add3_u32 v2, v2, v0, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX90A-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5450,40 +5445,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s20, 0x200
 ; GFX908-NEXT:    s_and_b32 s4, s20, -4
-; GFX908-NEXT:    v_mov_b32_e32 v3, s4
-; GFX908-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX908-NEXT:    v_mov_b32_e32 v4, s4
+; GFX908-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX908-NEXT:    s_and_b32 s4, s20, 3
 ; GFX908-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX908-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX908-NEXT:    s_not_b32 s7, s4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v5, v1
-; GFX908-NEXT:    v_lshrrev_b32_sdwa v1, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX908-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX908-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX908-NEXT:    v_add3_u32 v2, v2, v1, s8
-; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX908-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
-; GFX908-NEXT:    v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT:    v_and_or_b32 v4, v5, s7, v1
-; GFX908-NEXT:    v_mov_b32_e32 v1, v4
-; GFX908-NEXT:    v_mov_b32_e32 v2, v5
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT:    v_add_f32_e32 v0, v0, v5
+; GFX908-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX908-NEXT:    v_add3_u32 v2, v2, v0, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX908-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX908-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX908-NEXT:    v_mov_b32_e32 v3, v1
+; GFX908-NEXT:    v_mov_b32_e32 v2, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -5491,42 +5486,42 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s20, 0x200
 ; GFX8-NEXT:    s_and_b32 s4, s20, -4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
-; GFX8-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX8-NEXT:    s_and_b32 s4, s20, 3
 ; GFX8-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX8-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_lshrrev_b32_sdwa v1, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX8-NEXT:    v_bfe_u32 v6, v1, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v1
+; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX8-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v3
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v1
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
-; GFX8-NEXT:    v_and_b32_e32 v4, s7, v5
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_or_b32_e32 v4, v4, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v2, v5
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v3
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc
+; GFX8-NEXT:    v_and_b32_e32 v2, s7, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -5534,38 +5529,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s20, 0x200
 ; GFX7-NEXT:    s_and_b32 s4, s20, -4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s4
-; GFX7-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v4, s4
+; GFX7-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX7-NEXT:    s_and_b32 s4, s20, 3
 ; GFX7-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX7-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX7-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX7-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7-NEXT:    v_mov_b32_e32 v2, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s7, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5574,39 +5569,39 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s20, 0x200
 ; GFX6-NEXT:    s_and_b32 s4, s20, -4
-; GFX6-NEXT:    v_mov_b32_e32 v3, s4
-; GFX6-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v4, s4
+; GFX6-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX6-NEXT:    s_and_b32 s4, s20, 3
 ; GFX6-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX6-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX6-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    v_add_f32_e32 v0, v0, v5
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX6-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX6-NEXT:    v_mov_b32_e32 v1, v4
-; GFX6-NEXT:    v_mov_b32_e32 v2, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX6-NEXT:    v_and_b32_e32 v2, s7, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT:    v_mov_b32_e32 v3, v1
+; GFX6-NEXT:    v_mov_b32_e32 v2, v0
+; GFX6-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -5624,48 +5619,48 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
 ; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_addk_co_i32 s16, 0x200
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX12-TRUE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], null offen
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, v2, s[0:3], null offen
 ; GFX12-TRUE16-NEXT:    s_not_b32 s6, s5
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-TRUE16-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-TRUE16-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v1.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.h, 0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v0.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX12-TRUE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -5683,47 +5678,47 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_addk_co_i32 s16, 0x200
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX12-FAKE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], null offen
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, v2, s[0:3], null offen
 ; GFX12-FAKE16-NEXT:    s_not_b32 s6, s5
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-FAKE16-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX12-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -5738,36 +5733,36 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    s_addk_i32 s16, 0x200
 ; GFX942-NEXT:    s_and_b32 s4, s16, -4
-; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; GFX942-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen
 ; GFX942-NEXT:    s_and_b32 s4, s16, 3
 ; GFX942-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX942-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX942-NEXT:    s_not_b32 s7, s4
 ; GFX942-NEXT:    s_mov_b64 s[4:5], 0
-; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX942-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX942-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v5, v2
-; GFX942-NEXT:    v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX942-NEXT:    buffer_wbl2 sc1
-; GFX942-NEXT:    v_add_f32_e32 v2, v2, v0
-; GFX942-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX942-NEXT:    v_or_b32_e32 v4, 0x400000, v2
-; GFX942-NEXT:    v_add3_u32 v3, v3, v2, s8
-; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX942-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX942-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX942-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX942-NEXT:    v_add3_u32 v4, v4, v0, s8
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX942-NEXT:    v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT:    v_and_or_b32 v4, v5, s7, v2
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
-; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX942-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_inv sc1
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX942-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5778,46 +5773,46 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_addk_i32 s16, 0x200
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, v2, s[0:3], 0 offen
 ; GFX11-TRUE16-NEXT:    s_not_b32 s6, s5
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-TRUE16-NEXT:    .p2align 6
 ; GFX11-TRUE16-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v1.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v0.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v4
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX11-TRUE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -5830,45 +5825,45 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s16, 0x200
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v2, s[0:3], 0 offen
 ; GFX11-FAKE16-NEXT:    s_not_b32 s6, s5
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-FAKE16-NEXT:    .p2align 6
 ; GFX11-FAKE16-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    buffer_gl1_inv
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -5881,36 +5876,36 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s20, 0x200
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX10-NEXT:    s_and_b32 s4, s20, -4
-; GFX10-NEXT:    v_mov_b32_e32 v3, s4
+; GFX10-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-NEXT:    s_and_b32 s4, s20, 3
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX10-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX10-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, s4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_mov_b32_e32 v2, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX10-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB17_1
@@ -5923,34 +5918,34 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s20, 0x200
 ; GFX90A-NEXT:    s_and_b32 s4, s20, -4
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s4
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX90A-NEXT:    s_and_b32 s4, s20, 3
 ; GFX90A-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX90A-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX90A-NEXT:    s_not_b32 s7, s4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
-; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT:    v_add_f32_e32 v2, v2, v0
-; GFX90A-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX90A-NEXT:    v_or_b32_e32 v4, 0x400000, v2
-; GFX90A-NEXT:    v_add3_u32 v3, v3, v2, s8
-; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX90A-NEXT:    v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT:    v_and_or_b32 v4, v5, s7, v2
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX90A-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX90A-NEXT:    v_add3_u32 v4, v4, v0, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX90A-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5962,35 +5957,35 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s20, 0x200
 ; GFX908-NEXT:    s_and_b32 s4, s20, -4
-; GFX908-NEXT:    v_mov_b32_e32 v3, s4
-; GFX908-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX908-NEXT:    v_mov_b32_e32 v2, s4
+; GFX908-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX908-NEXT:    s_and_b32 s4, s20, 3
 ; GFX908-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX908-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX908-NEXT:    s_not_b32 s7, s4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX908-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX908-NEXT:    v_add3_u32 v4, v4, v0, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX908-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX908-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT:    v_and_or_b32 v0, v1, s7, v0
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v1
-; GFX908-NEXT:    v_lshrrev_b32_sdwa v1, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX908-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX908-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX908-NEXT:    v_add3_u32 v2, v2, v1, s8
-; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX908-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
-; GFX908-NEXT:    v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT:    v_and_or_b32 v4, v5, s7, v1
-; GFX908-NEXT:    v_mov_b32_e32 v1, v4
-; GFX908-NEXT:    v_mov_b32_e32 v2, v5
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v4, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6002,37 +5997,37 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s20, 0x200
 ; GFX8-NEXT:    s_and_b32 s4, s20, -4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
-; GFX8-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX8-NEXT:    s_and_b32 s4, s20, 3
 ; GFX8-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX8-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_lshrrev_b32_sdwa v1, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX8-NEXT:    v_bfe_u32 v6, v1, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v1
+; GFX8-NEXT:    v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_add_f32_e32 v5, v5, v3
+; GFX8-NEXT:    v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v5
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v1
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
-; GFX8-NEXT:    v_and_b32_e32 v4, s7, v5
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_or_b32_e32 v4, v4, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v2, v5
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v5
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX8-NEXT:    v_and_b32_e32 v4, s7, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT:    v_mov_b32_e32 v5, v1
+; GFX8-NEXT:    v_mov_b32_e32 v4, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6044,33 +6039,33 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s20, 0x200
 ; GFX7-NEXT:    s_and_b32 s4, s20, -4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s4
-; GFX7-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX7-NEXT:    s_and_b32 s4, s20, 3
 ; GFX7-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX7-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v4, s7, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX7-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX7-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7-NEXT:    v_mov_b32_e32 v2, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v4, v0
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6082,34 +6077,34 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s20, 0x200
 ; GFX6-NEXT:    s_and_b32 s4, s20, -4
-; GFX6-NEXT:    v_mov_b32_e32 v3, s4
-; GFX6-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX6-NEXT:    s_and_b32 s4, s20, 3
 ; GFX6-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX6-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX6-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX6-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX6-NEXT:    v_mov_b32_e32 v1, v4
-; GFX6-NEXT:    v_mov_b32_e32 v2, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX6-NEXT:    v_and_b32_e32 v4, s7, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX6-NEXT:    v_mov_b32_e32 v5, v1
+; GFX6-NEXT:    v_mov_b32_e32 v4, v0
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6151,7 +6146,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_b32 v4, v8, s[4:7], null offen
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v6, v8, s[4:7], null offen
 ; GFX12-TRUE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX12-TRUE16-NEXT:  ; %bb.2:
@@ -6162,27 +6157,26 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX12-TRUE16-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v4
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_add_f32_e32 v4, v4, v10
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v5, v4, 16, 1
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v4
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v4.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v5
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v5
 ; GFX12-TRUE16-NEXT:    v_and_or_b32 v5, v6, v9, v4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, v6
 ; GFX12-TRUE16-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
@@ -6206,6 +6200,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX12-TRUE16-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-TRUE16-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
@@ -6246,7 +6241,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    buffer_load_b32 v4, v8, s[4:7], null offen
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v6, v8, s[4:7], null offen
 ; GFX12-FAKE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX12-FAKE16-NEXT:  ; %bb.2:
@@ -6257,26 +6252,25 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX12-FAKE16-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v4
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_add_f32_e32 v4, v4, v10
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v5, v4, 16, 1
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v4
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
 ; GFX12-FAKE16-NEXT:    v_and_or_b32 v5, v6, v9, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v5, v6
 ; GFX12-FAKE16-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
@@ -6300,6 +6294,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX12-FAKE16-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-FAKE16-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
@@ -6332,7 +6327,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX942-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
 ; GFX942-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT:    buffer_load_dword v4, v9, s[4:7], 0 offen
+; GFX942-NEXT:    buffer_load_dword v7, v9, s[4:7], 0 offen
 ; GFX942-NEXT:    s_xor_b64 exec, exec, s[0:1]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX942-NEXT:  ; %bb.2:
@@ -6344,7 +6339,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX942-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX942-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX942-NEXT:    v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX942-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX942-NEXT:    v_add_f32_e32 v4, v4, v11
@@ -6378,6 +6372,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
 ; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX942-NEXT:    buffer_inv sc1
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB18_3
@@ -6410,7 +6405,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX11-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT:    buffer_load_b32 v4, v8, s[4:7], 0 offen
+; GFX11-TRUE16-NEXT:    buffer_load_b32 v6, v8, s[4:7], 0 offen
 ; GFX11-TRUE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX11-TRUE16-NEXT:  ; %bb.2:
@@ -6422,26 +6417,25 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX11-TRUE16-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_add_f32_e32 v4, v4, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v5, v4, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v4
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v4.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v5
 ; GFX11-TRUE16-NEXT:    v_and_or_b32 v5, v6, v9, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v6
 ; GFX11-TRUE16-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
@@ -6464,6 +6458,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
 ; GFX11-TRUE16-NEXT:    s_or_b32 s1, vcc_lo, s1
@@ -6500,7 +6495,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX11-FAKE16-NEXT:    buffer_load_b32 v4, v8, s[4:7], 0 offen
+; GFX11-FAKE16-NEXT:    buffer_load_b32 v6, v8, s[4:7], 0 offen
 ; GFX11-FAKE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX11-FAKE16-NEXT:  ; %bb.2:
@@ -6512,25 +6507,24 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX11-FAKE16-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_add_f32_e32 v4, v4, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v4, 16, 1
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v4
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v5, v6, v9, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v6
 ; GFX11-FAKE16-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
@@ -6553,6 +6547,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX11-FAKE16-NEXT:    buffer_gl1_inv
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, vcc_lo, s1
@@ -6585,7 +6580,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
 ; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
-; GFX10-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX10-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB18_1
@@ -6596,10 +6591,9 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX10-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, v4
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_add_f32_e32 v4, v4, v10
 ; GFX10-NEXT:    v_bfe_u32 v5, v4, 16, 1
 ; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v4
@@ -6629,6 +6623,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
@@ -6661,7 +6656,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX90A-NEXT:    s_nop 0
-; GFX90A-NEXT:    buffer_load_dword v4, v9, s[8:11], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v7, v9, s[8:11], 0 offen
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX90A-NEXT:  ; %bb.2:
@@ -6673,7 +6668,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX90A-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX90A-NEXT:    v_add_f32_e32 v4, v4, v11
 ; GFX90A-NEXT:    v_bfe_u32 v5, v4, 16, 1
@@ -6704,6 +6698,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB18_3
@@ -6733,7 +6728,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX908-NEXT:    s_nop 0
-; GFX908-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX908-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX908-NEXT:  ; %bb.2:
@@ -6745,7 +6740,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX908-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX908-NEXT:    v_add_f32_e32 v4, v4, v10
 ; GFX908-NEXT:    v_bfe_u32 v5, v4, 16, 1
@@ -6777,6 +6771,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB18_3
@@ -6806,7 +6801,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX8-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX8-NEXT:  ; %bb.2:
@@ -6817,7 +6812,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX8-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_add_f32_e32 v4, v4, v10
 ; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
@@ -6851,6 +6845,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB18_3
@@ -6878,19 +6873,18 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
 ; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX7-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX7-NEXT:  ; %bb.2:
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v5
 ; GFX7-NEXT:    s_mov_b64 s[6:7], 0
-; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
 ; GFX7-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX7-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_add_f32_e32 v4, v4, v10
@@ -6920,6 +6914,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
 ; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB18_3
@@ -6948,20 +6943,18 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
 ; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX6-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX6-NEXT:  ; %bb.2:
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v5
 ; GFX6-NEXT:    s_mov_b64 s[6:7], 0
-; GFX6-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
+; GFX6-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
 ; GFX6-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX6-NEXT:    ; Child Loop BB18_4 Depth 2
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v6, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_add_f32_e32 v4, v4, v10
@@ -6991,6 +6984,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
 ; GFX6-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB18_3
@@ -7291,20 +7285,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
 ; GFX11-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-NEXT:    v_pk_add_f16 v1, v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_pk_add_f16 v4, v5, v0
-; GFX11-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_mov_b32_e32 v4, v1
+; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -7320,20 +7315,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
 ; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
 ; GFX10-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_pk_add_f16 v1, v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_pk_add_f16 v4, v5, v0
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_mov_b32_e32 v2, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_mov_b32_e32 v4, v1
+; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB20_1
@@ -7363,24 +7358,24 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s20
-; GFX8-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
 ; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX8-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_add_f16_sdwa v1, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_f16_e32 v2, v5, v0
-; GFX8-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v2, v5
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v4, v2, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX8-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8-NEXT:    v_mov_b32_e32 v4, v1
+; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8282,20 +8277,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
 ; GFX11-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-NEXT:    v_pk_add_f16 v1, v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_pk_add_f16 v4, v5, v0
-; GFX11-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_mov_b32_e32 v4, v1
+; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -8311,20 +8307,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
 ; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
 ; GFX10-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_pk_add_f16 v1, v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_pk_add_f16 v4, v5, v0
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_mov_b32_e32 v2, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_mov_b32_e32 v4, v1
+; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB23_1
@@ -8336,21 +8332,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT:    buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
 ; GFX90A-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
-; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v0
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_pk_add_f16 v2, v3, v0
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB23_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8361,22 +8357,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v1, s20
-; GFX908-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
 ; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX908-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v5, v1
-; GFX908-NEXT:    v_pk_add_f16 v4, v5, v0
-; GFX908-NEXT:    v_mov_b32_e32 v1, v4
-; GFX908-NEXT:    v_mov_b32_e32 v2, v5
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_pk_add_f16 v1, v2, v0
+; GFX908-NEXT:    v_mov_b32_e32 v5, v2
+; GFX908-NEXT:    v_mov_b32_e32 v4, v1
+; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB23_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8387,24 +8383,24 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s20
-; GFX8-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
 ; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX8-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_add_f16_sdwa v1, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_f16_e32 v2, v5, v0
-; GFX8-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v2, v5
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v4, v2, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX8-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8-NEXT:    v_mov_b32_e32 v4, v1
+; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB23_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8808,20 +8804,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT:    buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
 ; GFX11-NEXT:  .LBB25_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-NEXT:    v_pk_add_f16 v1, v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_pk_add_f16 v4, v5, v0
-; GFX11-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_mov_b32_e32 v4, v1
+; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -8837,20 +8834,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
 ; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
 ; GFX10-NEXT:  .LBB25_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_pk_add_f16 v1, v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_pk_add_f16 v4, v5, v0
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_mov_b32_e32 v2, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_mov_b32_e32 v4, v1
+; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB25_1
@@ -8862,21 +8859,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT:    buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
 ; GFX90A-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-NEXT:  .LBB25_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
-; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v0
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_pk_add_f16 v2, v3, v0
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB25_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8887,22 +8884,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v1, s20
-; GFX908-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX908-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
 ; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX908-NEXT:  .LBB25_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v5, v1
-; GFX908-NEXT:    v_pk_add_f16 v4, v5, v0
-; GFX908-NEXT:    v_mov_b32_e32 v1, v4
-; GFX908-NEXT:    v_mov_b32_e32 v2, v5
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_pk_add_f16 v1, v2, v0
+; GFX908-NEXT:    v_mov_b32_e32 v5, v2
+; GFX908-NEXT:    v_mov_b32_e32 v4, v1
+; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB25_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8913,24 +8910,24 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s20
-; GFX8-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX8-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
 ; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX8-NEXT:  .LBB25_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_add_f16_sdwa v1, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_f16_e32 v2, v5, v0
-; GFX8-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v2, v5
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v4, v2, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX8-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8-NEXT:    v_mov_b32_e32 v4, v1
+; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB25_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -9496,40 +9493,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_mov_b32_e32 v1, s16
-; GFX942-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
 ; GFX942-NEXT:    s_add_i32 s4, s16, 0x400
 ; GFX942-NEXT:    s_mov_b64 s[6:7], 0
-; GFX942-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX942-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX942-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX942-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX942-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX942-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX942-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v7, v2
-; GFX942-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX942-NEXT:    v_and_b32_e32 v3, 0xffff0000, v7
-; GFX942-NEXT:    v_add_f32_e32 v2, v2, v1
-; GFX942-NEXT:    v_add_f32_e32 v3, v3, v0
-; GFX942-NEXT:    v_bfe_u32 v5, v2, 16, 1
-; GFX942-NEXT:    v_bfe_u32 v8, v3, 16, 1
-; GFX942-NEXT:    v_or_b32_e32 v6, 0x400000, v2
-; GFX942-NEXT:    v_or_b32_e32 v9, 0x400000, v3
-; GFX942-NEXT:    v_add3_u32 v5, v5, v2, s8
-; GFX942-NEXT:    v_add3_u32 v8, v8, v3, s8
-; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX942-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX942-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX942-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX942-NEXT:    v_add_f32_e32 v5, v5, v3
+; GFX942-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX942-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX942-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX942-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX942-NEXT:    v_add3_u32 v6, v6, v0, s8
+; GFX942-NEXT:    v_add3_u32 v8, v8, v5, s8
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
 ; GFX942-NEXT:    buffer_wbl2 sc1
-; GFX942-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX942-NEXT:    v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX942-NEXT:    v_perm_b32 v6, v3, v2, s9
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
-; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX942-NEXT:    v_perm_b32 v0, v5, v0, s9
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
+; GFX942-NEXT:    buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_inv sc1
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v7
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v1
 ; GFX942-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -9539,11 +9536,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, s16
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    s_add_i32 s4, s16, 0x400
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
 ; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
 ; GFX11-TRUE16-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -9551,35 +9547,34 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX11-TRUE16-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v1, 16, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v5, v5, v2 :: v_dual_add_f32 v0, v0, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v5, 16, 1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v5, v8, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v1, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
-; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v6
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX11-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -9592,11 +9587,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, s16
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
 ; GFX11-FAKE16-NEXT:    s_add_i32 s4, s16, 0x400
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
 ; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-FAKE16-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -9604,33 +9598,32 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX11-FAKE16-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX11-FAKE16-NEXT:    v_dual_add_f32 v2, v2, v0 :: v_dual_lshlrev_b32 v1, 16, v6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v5, 16, 1
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-FAKE16-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v1, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v5, v8, s4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v2, v1, 0x7060302
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
-; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    buffer_gl1_inv
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v6
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -9645,38 +9638,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s20
 ; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX10-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_add_f32_e32 v2, v2, v0
-; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
-; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
-; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v1, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v8, s4
-; GFX10-NEXT:    v_perm_b32 v5, v2, v1, 0x7060302
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
-; GFX10-NEXT:    v_mov_b32_e32 v2, v6
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v3
+; GFX10-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX10-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v6, v1
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v6
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB27_1
@@ -9688,39 +9681,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX90A-NEXT:    s_add_i32 s4, s20, 0x400
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX90A-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v7, v2
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v7
-; GFX90A-NEXT:    v_add_f32_e32 v2, v2, v1
-; GFX90A-NEXT:    v_add_f32_e32 v3, v3, v0
-; GFX90A-NEXT:    v_bfe_u32 v5, v2, 16, 1
-; GFX90A-NEXT:    v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT:    v_or_b32_e32 v6, 0x400000, v2
-; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT:    v_add3_u32 v5, v5, v2, s8
-; GFX90A-NEXT:    v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX90A-NEXT:    v_perm_b32 v6, v3, v2, s9
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX90A-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v3
+; GFX90A-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT:    v_add3_u32 v6, v6, v0, s8
+; GFX90A-NEXT:    v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT:    v_perm_b32 v0, v5, v0, s9
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v7
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v1
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -9734,37 +9727,37 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX908-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX908-NEXT:    s_add_i32 s4, s20, 0x400
 ; GFX908-NEXT:    s_mov_b64 s[6:7], 0
-; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX908-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX908-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX908-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX908-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX908-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX908-NEXT:    v_add_f32_e32 v5, v5, v3
+; GFX908-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT:    v_add3_u32 v6, v6, v0, s8
+; GFX908-NEXT:    v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT:    v_perm_b32 v0, v5, v0, s9
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v1
-; GFX908-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX908-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX908-NEXT:    v_add_f32_e32 v2, v2, v0
-; GFX908-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX908-NEXT:    v_bfe_u32 v8, v2, 16, 1
-; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v1
-; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX908-NEXT:    v_add3_u32 v5, v5, v1, s8
-; GFX908-NEXT:    v_add3_u32 v8, v8, v2, s8
-; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX908-NEXT:    v_cndmask_b32_e64 v1, v5, v7, s[4:5]
-; GFX908-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
-; GFX908-NEXT:    v_perm_b32 v5, v2, v1, s9
-; GFX908-NEXT:    v_mov_b32_e32 v1, v5
-; GFX908-NEXT:    v_mov_b32_e32 v2, v6
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v5, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v6
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -9778,38 +9771,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX8-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX8-NEXT:    s_add_i32 s4, s20, 0x400
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX8-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v6, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_add_f32_e32 v2, v2, v0
-; GFX8-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX8-NEXT:    v_bfe_u32 v8, v2, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v1
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v2
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_add_f32_e32 v5, v5, v3
+; GFX8-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v1
-; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v5, v7, s[4:5]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_alignbit_b32 v5, v2, v1, 16
-; GFX8-NEXT:    v_mov_b32_e32 v1, v5
-; GFX8-NEXT:    v_mov_b32_e32 v2, v6
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_alignbit_b32 v0, v5, v0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v6, v1
+; GFX8-NEXT:    v_mov_b32_e32 v5, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -11107,40 +11100,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_mov_b32_e32 v1, s16
-; GFX942-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
 ; GFX942-NEXT:    s_add_i32 s4, s16, 0x400
 ; GFX942-NEXT:    s_mov_b64 s[6:7], 0
-; GFX942-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX942-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX942-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX942-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX942-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX942-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX942-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v7, v2
-; GFX942-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX942-NEXT:    v_and_b32_e32 v3, 0xffff0000, v7
-; GFX942-NEXT:    v_add_f32_e32 v2, v2, v1
-; GFX942-NEXT:    v_add_f32_e32 v3, v3, v0
-; GFX942-NEXT:    v_bfe_u32 v5, v2, 16, 1
-; GFX942-NEXT:    v_bfe_u32 v8, v3, 16, 1
-; GFX942-NEXT:    v_or_b32_e32 v6, 0x400000, v2
-; GFX942-NEXT:    v_or_b32_e32 v9, 0x400000, v3
-; GFX942-NEXT:    v_add3_u32 v5, v5, v2, s8
-; GFX942-NEXT:    v_add3_u32 v8, v8, v3, s8
-; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX942-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX942-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX942-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX942-NEXT:    v_add_f32_e32 v5, v5, v3
+; GFX942-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX942-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX942-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX942-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX942-NEXT:    v_add3_u32 v6, v6, v0, s8
+; GFX942-NEXT:    v_add3_u32 v8, v8, v5, s8
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
 ; GFX942-NEXT:    buffer_wbl2 sc1
-; GFX942-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX942-NEXT:    v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX942-NEXT:    v_perm_b32 v6, v3, v2, s9
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
-; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX942-NEXT:    v_perm_b32 v0, v5, v0, s9
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
+; GFX942-NEXT:    buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_inv sc1
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v7
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v1
 ; GFX942-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -11150,11 +11143,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
 ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, s16
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    s_add_i32 s4, s16, 0x400
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
 ; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
 ; GFX11-TRUE16-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -11162,35 +11154,34 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
 ; GFX11-TRUE16-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v1, 16, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v5, v5, v2 :: v_dual_add_f32 v0, v0, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v5, 16, 1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v5, v8, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v1, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
-; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v6
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX11-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -11203,11 +11194,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
 ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, s16
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
 ; GFX11-FAKE16-NEXT:    s_add_i32 s4, s16, 0x400
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
 ; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-FAKE16-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -11215,33 +11205,32 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
 ; GFX11-FAKE16-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX11-FAKE16-NEXT:    v_dual_add_f32 v2, v2, v0 :: v_dual_lshlrev_b32 v1, 16, v6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v5, 16, 1
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-FAKE16-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v1, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v5, v8, s4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v2, v1, 0x7060302
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
-; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    buffer_gl1_inv
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v6
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -11256,38 +11245,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s20
 ; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX10-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_add_f32_e32 v2, v2, v0
-; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
-; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
-; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v1, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v8, s4
-; GFX10-NEXT:    v_perm_b32 v5, v2, v1, 0x7060302
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
-; GFX10-NEXT:    v_mov_b32_e32 v2, v6
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v3
+; GFX10-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX10-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v6, v1
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v6
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB30_1
@@ -11299,39 +11288,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX90A-NEXT:    s_add_i32 s4, s20, 0x400
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX90A-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v7, v2
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v7
-; GFX90A-NEXT:    v_add_f32_e32 v2, v2, v1
-; GFX90A-NEXT:    v_add_f32_e32 v3, v3, v0
-; GFX90A-NEXT:    v_bfe_u32 v5, v2, 16, 1
-; GFX90A-NEXT:    v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT:    v_or_b32_e32 v6, 0x400000, v2
-; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT:    v_add3_u32 v5, v5, v2, s8
-; GFX90A-NEXT:    v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX90A-NEXT:    v_perm_b32 v6, v3, v2, s9
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX90A-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v3
+; GFX90A-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT:    v_add3_u32 v6, v6, v0, s8
+; GFX90A-NEXT:    v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT:    v_perm_b32 v0, v5, v0, s9
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v7
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v1
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -11345,37 +11334,37 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
 ; GFX908-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX908-NEXT:    s_add_i32 s4, s20, 0x400
 ; GFX908-NEXT:    s_mov_b64 s[6:7], 0
-; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX908-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX908-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX908-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX908-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX908-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX908-NEXT:    v_add_f32_e32 v5, v5, v3
+; GFX908-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT:    v_add3_u32 v6, v6, v0, s8
+; GFX908-NEXT:    v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT:    v_perm_b32 v0, v5, v0, s9
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v1
-; GFX908-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX908-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX908-NEXT:    v_add_f32_e32 v2, v2, v0
-; GFX908-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX908-NEXT:    v_bfe_u32 v8, v2, 16, 1
-; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v1
-; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX908-NEXT:    v_add3_u32 v5, v5, v1, s8
-; GFX908-NEXT:    v_add3_u32 v8, v8, v2, s8
-; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX908-NEXT:    v_cndmask_b32_e64 v1, v5, v7, s[4:5]
-; GFX908-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
-; GFX908-NEXT:    v_perm_b32 v5, v2, v1, s9
-; GFX908-NEXT:    v_mov_b32_e32 v1, v5
-; GFX908-NEXT:    v_mov_b32_e32 v2, v6
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v5, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v6
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -11389,38 +11378,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
 ; GFX8-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX8-NEXT:    s_add_i32 s4, s20, 0x400
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX8-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v6, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_add_f32_e32 v2, v2, v0
-; GFX8-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX8-NEXT:    v_bfe_u32 v8, v2, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v1
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v2
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_add_f32_e32 v5, v5, v3
+; GFX8-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v1
-; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v5, v7, s[4:5]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_alignbit_b32 v5, v2, v1, 16
-; GFX8-NEXT:    v_mov_b32_e32 v1, v5
-; GFX8-NEXT:    v_mov_b32_e32 v2, v6
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_alignbit_b32 v0, v5, v0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v6, v1
+; GFX8-NEXT:    v_mov_b32_e32 v5, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -11974,40 +11963,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_mov_b32_e32 v1, s16
-; GFX942-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
 ; GFX942-NEXT:    s_add_i32 s4, s16, 0x400
 ; GFX942-NEXT:    s_mov_b64 s[6:7], 0
-; GFX942-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX942-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX942-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX942-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX942-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX942-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX942-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v7, v2
-; GFX942-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX942-NEXT:    v_and_b32_e32 v3, 0xffff0000, v7
-; GFX942-NEXT:    v_add_f32_e32 v2, v2, v1
-; GFX942-NEXT:    v_add_f32_e32 v3, v3, v0
-; GFX942-NEXT:    v_bfe_u32 v5, v2, 16, 1
-; GFX942-NEXT:    v_bfe_u32 v8, v3, 16, 1
-; GFX942-NEXT:    v_or_b32_e32 v6, 0x400000, v2
-; GFX942-NEXT:    v_or_b32_e32 v9, 0x400000, v3
-; GFX942-NEXT:    v_add3_u32 v5, v5, v2, s8
-; GFX942-NEXT:    v_add3_u32 v8, v8, v3, s8
-; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX942-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX942-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX942-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX942-NEXT:    v_add_f32_e32 v5, v5, v3
+; GFX942-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX942-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX942-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX942-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX942-NEXT:    v_add3_u32 v6, v6, v0, s8
+; GFX942-NEXT:    v_add3_u32 v8, v8, v5, s8
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
 ; GFX942-NEXT:    buffer_wbl2 sc1
-; GFX942-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX942-NEXT:    v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX942-NEXT:    v_perm_b32 v6, v3, v2, s9
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
-; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX942-NEXT:    v_perm_b32 v0, v5, v0, s9
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
+; GFX942-NEXT:    buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_inv sc1
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v7
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v1
 ; GFX942-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -12017,11 +12006,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, s16
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    s_add_i32 s4, s16, 0x400
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
 ; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
 ; GFX11-TRUE16-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -12029,35 +12017,34 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX11-TRUE16-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v1, 16, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v5, v5, v2 :: v_dual_add_f32 v0, v0, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v5, 16, 1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v5, v8, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v1, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
-; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v6
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX11-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -12070,11 +12057,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, s16
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
 ; GFX11-FAKE16-NEXT:    s_add_i32 s4, s16, 0x400
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
 ; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-FAKE16-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -12082,33 +12068,32 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX11-FAKE16-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX11-FAKE16-NEXT:    v_dual_add_f32 v2, v2, v0 :: v_dual_lshlrev_b32 v1, 16, v6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v5, 16, 1
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-FAKE16-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v1, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v5, v8, s4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v2, v1, 0x7060302
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
-; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    buffer_gl1_inv
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v6
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -12123,38 +12108,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s20
 ; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX10-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_add_f32_e32 v2, v2, v0
-; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
-; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
-; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v1, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v8, s4
-; GFX10-NEXT:    v_perm_b32 v5, v2, v1, 0x7060302
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
-; GFX10-NEXT:    v_mov_b32_e32 v2, v6
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v3
+; GFX10-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX10-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v6, v1
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v6
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB32_1
@@ -12166,39 +12151,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX90A-NEXT:    s_add_i32 s4, s20, 0x400
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX90A-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v7, v2
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v7
-; GFX90A-NEXT:    v_add_f32_e32 v2, v2, v1
-; GFX90A-NEXT:    v_add_f32_e32 v3, v3, v0
-; GFX90A-NEXT:    v_bfe_u32 v5, v2, 16, 1
-; GFX90A-NEXT:    v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT:    v_or_b32_e32 v6, 0x400000, v2
-; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT:    v_add3_u32 v5, v5, v2, s8
-; GFX90A-NEXT:    v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX90A-NEXT:    v_perm_b32 v6, v3, v2, s9
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX90A-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v3
+; GFX90A-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT:    v_add3_u32 v6, v6, v0, s8
+; GFX90A-NEXT:    v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT:    v_perm_b32 v0, v5, v0, s9
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v7
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v1
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -12212,37 +12197,37 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX908-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX908-NEXT:    s_add_i32 s4, s20, 0x400
 ; GFX908-NEXT:    s_mov_b64 s[6:7], 0
-; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX908-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX908-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX908-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX908-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX908-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX908-NEXT:    v_add_f32_e32 v5, v5, v3
+; GFX908-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT:    v_add3_u32 v6, v6, v0, s8
+; GFX908-NEXT:    v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT:    v_perm_b32 v0, v5, v0, s9
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v1
-; GFX908-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX908-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX908-NEXT:    v_add_f32_e32 v2, v2, v0
-; GFX908-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX908-NEXT:    v_bfe_u32 v8, v2, 16, 1
-; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v1
-; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX908-NEXT:    v_add3_u32 v5, v5, v1, s8
-; GFX908-NEXT:    v_add3_u32 v8, v8, v2, s8
-; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX908-NEXT:    v_cndmask_b32_e64 v1, v5, v7, s[4:5]
-; GFX908-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
-; GFX908-NEXT:    v_perm_b32 v5, v2, v1, s9
-; GFX908-NEXT:    v_mov_b32_e32 v1, v5
-; GFX908-NEXT:    v_mov_b32_e32 v2, v6
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v5, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v6
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -12256,38 +12241,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX8-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX8-NEXT:    s_add_i32 s4, s20, 0x400
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX8-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v6, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_add_f32_e32 v2, v2, v0
-; GFX8-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX8-NEXT:    v_bfe_u32 v8, v2, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v1
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v2
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_add_f32_e32 v5, v5, v3
+; GFX8-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v1
-; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v5, v7, s[4:5]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_alignbit_b32 v5, v2, v1, 16
-; GFX8-NEXT:    v_mov_b32_e32 v1, v5
-; GFX8-NEXT:    v_mov_b32_e32 v2, v6
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_alignbit_b32 v0, v5, v0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v6, v1
+; GFX8-NEXT:    v_mov_b32_e32 v5, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -12404,40 +12389,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_mov_b32_e32 v1, s16
-; GFX942-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
 ; GFX942-NEXT:    s_add_i32 s4, s16, 0x400
 ; GFX942-NEXT:    s_mov_b64 s[6:7], 0
-; GFX942-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX942-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX942-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX942-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX942-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX942-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX942-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v7, v2
-; GFX942-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX942-NEXT:    v_and_b32_e32 v3, 0xffff0000, v7
-; GFX942-NEXT:    v_add_f32_e32 v2, v2, v1
-; GFX942-NEXT:    v_add_f32_e32 v3, v3, v0
-; GFX942-NEXT:    v_bfe_u32 v5, v2, 16, 1
-; GFX942-NEXT:    v_bfe_u32 v8, v3, 16, 1
-; GFX942-NEXT:    v_or_b32_e32 v6, 0x400000, v2
-; GFX942-NEXT:    v_or_b32_e32 v9, 0x400000, v3
-; GFX942-NEXT:    v_add3_u32 v5, v5, v2, s8
-; GFX942-NEXT:    v_add3_u32 v8, v8, v3, s8
-; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX942-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX942-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX942-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX942-NEXT:    v_add_f32_e32 v5, v5, v3
+; GFX942-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX942-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX942-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX942-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX942-NEXT:    v_add3_u32 v6, v6, v0, s8
+; GFX942-NEXT:    v_add3_u32 v8, v8, v5, s8
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
 ; GFX942-NEXT:    buffer_wbl2 sc1
-; GFX942-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX942-NEXT:    v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX942-NEXT:    v_perm_b32 v6, v3, v2, s9
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
-; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX942-NEXT:    v_perm_b32 v0, v5, v0, s9
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
+; GFX942-NEXT:    buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_inv sc1
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v7
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v1
 ; GFX942-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -12447,11 +12432,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, s16
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    s_add_i32 s4, s16, 0x400
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
 ; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
 ; GFX11-TRUE16-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -12459,35 +12443,34 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX11-TRUE16-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX11-TRUE16-NEXT:    v_dual_add_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v1, 16, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, v1, v0
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v5, v5, v2 :: v_dual_add_f32 v0, v0, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v5, 16, 1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v5, v8, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v1, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
-; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v6
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX11-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -12500,11 +12483,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, s16
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
 ; GFX11-FAKE16-NEXT:    s_add_i32 s4, s16, 0x400
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
 ; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-FAKE16-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -12512,33 +12494,32 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX11-FAKE16-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX11-FAKE16-NEXT:    v_dual_add_f32 v2, v2, v0 :: v_dual_lshlrev_b32 v1, 16, v6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v5, 16, 1
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-FAKE16-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v1, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v5, v8, s4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v2, v1, 0x7060302
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
-; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    buffer_gl1_inv
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v6
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -12553,38 +12534,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s20
 ; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX10-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_add_f32_e32 v2, v2, v0
-; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
-; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
-; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v1, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v8, s4
-; GFX10-NEXT:    v_perm_b32 v5, v2, v1, 0x7060302
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
-; GFX10-NEXT:    v_mov_b32_e32 v2, v6
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v3
+; GFX10-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX10-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v6, v1
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v6
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB33_1
@@ -12596,39 +12577,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX90A-NEXT:    s_add_i32 s4, s20, 0x400
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX90A-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v7, v2
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v7
-; GFX90A-NEXT:    v_add_f32_e32 v2, v2, v1
-; GFX90A-NEXT:    v_add_f32_e32 v3, v3, v0
-; GFX90A-NEXT:    v_bfe_u32 v5, v2, 16, 1
-; GFX90A-NEXT:    v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT:    v_or_b32_e32 v6, 0x400000, v2
-; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT:    v_add3_u32 v5, v5, v2, s8
-; GFX90A-NEXT:    v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX90A-NEXT:    v_perm_b32 v6, v3, v2, s9
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX90A-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v3
+; GFX90A-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT:    v_add3_u32 v6, v6, v0, s8
+; GFX90A-NEXT:    v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT:    v_perm_b32 v0, v5, v0, s9
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v7
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v1
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -12642,37 +12623,37 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX908-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX908-NEXT:    s_add_i32 s4, s20, 0x400
 ; GFX908-NEXT:    s_mov_b64 s[6:7], 0
-; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX908-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX908-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX908-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX908-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX908-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX908-NEXT:    v_add_f32_e32 v5, v5, v3
+; GFX908-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT:    v_add3_u32 v6, v6, v0, s8
+; GFX908-NEXT:    v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT:    v_perm_b32 v0, v5, v0, s9
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v1
-; GFX908-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX908-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX908-NEXT:    v_add_f32_e32 v2, v2, v0
-; GFX908-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX908-NEXT:    v_bfe_u32 v8, v2, 16, 1
-; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v1
-; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX908-NEXT:    v_add3_u32 v5, v5, v1, s8
-; GFX908-NEXT:    v_add3_u32 v8, v8, v2, s8
-; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX908-NEXT:    v_cndmask_b32_e64 v1, v5, v7, s[4:5]
-; GFX908-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
-; GFX908-NEXT:    v_perm_b32 v5, v2, v1, s9
-; GFX908-NEXT:    v_mov_b32_e32 v1, v5
-; GFX908-NEXT:    v_mov_b32_e32 v2, v6
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v5, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v6
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -12686,38 +12667,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX8-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX8-NEXT:    s_add_i32 s4, s20, 0x400
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX8-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v6, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_add_f32_e32 v2, v2, v0
-; GFX8-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX8-NEXT:    v_bfe_u32 v8, v2, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v1
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v2
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_add_f32_e32 v5, v5, v3
+; GFX8-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v1
-; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v5, v7, s[4:5]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_alignbit_b32 v5, v2, v1, 16
-; GFX8-NEXT:    v_mov_b32_e32 v1, v5
-; GFX8-NEXT:    v_mov_b32_e32 v2, v6
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_alignbit_b32 v0, v5, v0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v6, v1
+; GFX8-NEXT:    v_mov_b32_e32 v5, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end

diff  --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index 3ad1e5c0b81e0..ab867b089b875 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -211,24 +211,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_mov_b32_e32 v1, s16
-; GFX942-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
 ; GFX942-NEXT:    s_add_i32 s6, s16, 0x400
 ; GFX942-NEXT:    s_mov_b64 s[4:5], 0
-; GFX942-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX942-NEXT:  .LBB1_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v5, v2
-; GFX942-NEXT:    v_max_f32_e32 v2, v5, v5
-; GFX942-NEXT:    v_max_f32_e32 v4, v2, v0
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    buffer_wbl2 sc1
-; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
+; GFX942-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_inv sc1
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX942-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB1_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -261,23 +261,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX90A-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX90A-NEXT:  .LBB1_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
-; GFX90A-NEXT:    v_max_f32_e32 v2, v5, v5
-; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v0
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX90A-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB1_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -291,21 +291,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
 ; GFX908-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX908-NEXT:    v_max_f32_e32 v2, v0, v0
 ; GFX908-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX908-NEXT:  .LBB1_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX908-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v1
-; GFX908-NEXT:    v_max_f32_e32 v1, v5, v5
-; GFX908-NEXT:    v_max_f32_e32 v4, v1, v0
-; GFX908-NEXT:    v_mov_b32_e32 v1, v4
-; GFX908-NEXT:    v_mov_b32_e32 v2, v5
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v4, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB1_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -319,21 +319,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
 ; GFX8-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX8-NEXT:  .LBB1_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v5
-; GFX8-NEXT:    v_max_f32_e32 v4, v1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v2, v5
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_mov_b32_e32 v4, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB1_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1396,7 +1396,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s16
-; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
 ; GFX12-NEXT:    s_add_co_i32 s4, s16, 0x800
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    v_mov_b32_e32 v6, s4
@@ -1405,18 +1405,17 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
 ; GFX12-NEXT:  .LBB6_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[9:10], v[9:10]
-; GFX12-NEXT:    v_max_num_f64_e32 v[7:8], v[2:3], v[0:1]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
-; GFX12-NEXT:    v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10
-; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX12-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10]
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
+; GFX12-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
 ; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1440,7 +1439,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s16
-; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX11-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
 ; GFX11-NEXT:    s_add_i32 s4, s16, 0x800
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, s4
@@ -1449,19 +1448,18 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
 ; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX11-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[2:3], v[9:10], v[9:10]
-; GFX11-NEXT:    v_max_f64 v[7:8], v[2:3], v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
-; GFX11-NEXT:    v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10
-; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX11-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10]
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
+; GFX11-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
 ; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1495,26 +1493,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v2, s20
 ; GFX908-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
-; GFX908-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX908-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
 ; GFX908-NEXT:    s_add_i32 s6, s20, 0x800
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_mov_b32_e32 v6, s6
 ; GFX908-NEXT:  .LBB6_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v10, v3
 ; GFX908-NEXT:    v_mov_b32_e32 v9, v2
-; GFX908-NEXT:    v_max_f64 v[2:3], v[9:10], v[9:10]
-; GFX908-NEXT:    v_max_f64 v[7:8], v[2:3], v[0:1]
-; GFX908-NEXT:    v_mov_b32_e32 v2, v7
-; GFX908-NEXT:    v_mov_b32_e32 v3, v8
-; GFX908-NEXT:    v_mov_b32_e32 v4, v9
-; GFX908-NEXT:    v_mov_b32_e32 v5, v10
-; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v8, v1
+; GFX908-NEXT:    v_mov_b32_e32 v7, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10]
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
+; GFX908-NEXT:    v_mov_b32_e32 v2, v7
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB6_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1526,26 +1524,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s20
 ; GFX8-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
-; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX8-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
 ; GFX8-NEXT:    s_add_i32 s6, s20, 0x800
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s6
 ; GFX8-NEXT:  .LBB6_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v10, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v9, v2
-; GFX8-NEXT:    v_max_f64 v[2:3], v[9:10], v[9:10]
-; GFX8-NEXT:    v_max_f64 v[7:8], v[2:3], v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v2, v7
-; GFX8-NEXT:    v_mov_b32_e32 v3, v8
-; GFX8-NEXT:    v_mov_b32_e32 v4, v9
-; GFX8-NEXT:    v_mov_b32_e32 v5, v10
-; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_mov_b32_e32 v8, v1
+; GFX8-NEXT:    v_mov_b32_e32 v7, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10]
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v2, v7
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB6_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2501,43 +2499,42 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX12-TRUE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], null offen
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v2, v5, s[0:3], null offen
 ; GFX12-TRUE16-NEXT:    s_not_b32 s6, s5
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-TRUE16-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v2
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
 ; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v1.l, v1.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v1.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_and_or_b32 v1, v2, s6, v1
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX12-TRUE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
 ; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX12-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v3
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2549,47 +2546,46 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_addk_co_i32 s16, 0x200
-; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v5, v0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX12-FAKE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], null offen
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, v4, s[0:3], null offen
 ; GFX12-FAKE16-NEXT:    s_not_b32 s6, s5
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-FAKE16-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v5
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v0
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX12-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
 ; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX12-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2598,30 +2594,30 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    s_addk_i32 s16, 0x200
 ; GFX942-NEXT:    s_and_b32 s4, s16, -4
-; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; GFX942-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen
 ; GFX942-NEXT:    s_and_b32 s4, s16, 3
 ; GFX942-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX942-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX942-NEXT:    s_not_b32 s7, s4
 ; GFX942-NEXT:    s_mov_b64 s[4:5], 0
-; GFX942-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX942-NEXT:    v_max_f16_e32 v5, v0, v0
 ; GFX942-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v5, v2
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, s6, v5
-; GFX942-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX942-NEXT:    v_max_f16_e32 v2, v2, v0
-; GFX942-NEXT:    v_lshlrev_b32_e32 v2, s6, v2
-; GFX942-NEXT:    v_and_or_b32 v4, v5, s7, v2
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX942-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX942-NEXT:    v_max_f16_e32 v0, v0, v5
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX942-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    buffer_wbl2 sc1
-; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
+; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_inv sc1
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX942-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2636,127 +2632,125 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT:    buffer_load_b32 v2, v5, s[0:3], 0 offen
 ; GFX11-TRUE16-NEXT:    s_not_b32 s6, s5
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-TRUE16-NEXT:    .p2align 6
 ; GFX11-TRUE16-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v2
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v1.l, v0.h, v0.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v1, v2, s6, v1
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX11-TRUE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX11-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v3
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s16, 0x200
-; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v5, v0, v0
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v4, s[0:3], 0 offen
 ; GFX11-FAKE16-NEXT:    s_not_b32 s6, s5
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-FAKE16-NEXT:    .p2align 6
 ; GFX11-FAKE16-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX11-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    buffer_gl1_inv
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
 ; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX11-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s20, 0x200
-; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX10-NEXT:    v_max_f16_e32 v5, v0, v0
 ; GFX10-NEXT:    s_and_b32 s4, s20, -4
-; GFX10-NEXT:    v_mov_b32_e32 v3, s4
+; GFX10-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX10-NEXT:    s_and_b32 s4, s20, 3
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX10-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX10-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX10-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX10-NEXT:    v_max_f16_e32 v1, v1, v0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_mov_b32_e32 v2, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX10-NEXT:    v_max_f16_e32 v0, v0, v5
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -2764,29 +2758,29 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s20, 0x200
 ; GFX90A-NEXT:    s_and_b32 s4, s20, -4
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s4
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
+; GFX90A-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX90A-NEXT:    s_and_b32 s4, s20, 3
 ; GFX90A-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX90A-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX90A-NEXT:    s_not_b32 s7, s4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX90A-NEXT:    v_max_f16_e32 v5, v0, v0
 ; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
-; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, s6, v5
-; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v0
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, s6, v2
-; GFX90A-NEXT:    v_and_or_b32 v4, v5, s7, v2
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX90A-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX90A-NEXT:    v_max_f16_e32 v0, v0, v5
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX90A-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2799,35 +2793,35 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s20, 0x200
 ; GFX908-NEXT:    s_and_b32 s4, s20, -4
-; GFX908-NEXT:    v_mov_b32_e32 v3, s4
-; GFX908-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX908-NEXT:    v_mov_b32_e32 v4, s4
+; GFX908-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX908-NEXT:    s_and_b32 s4, s20, 3
 ; GFX908-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX908-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX908-NEXT:    s_not_b32 s7, s4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX908-NEXT:    v_max_f16_e32 v5, v0, v0
 ; GFX908-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v5, v1
-; GFX908-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX908-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX908-NEXT:    v_max_f16_e32 v1, v1, v0
-; GFX908-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX908-NEXT:    v_and_or_b32 v4, v5, s7, v1
-; GFX908-NEXT:    v_mov_b32_e32 v1, v4
-; GFX908-NEXT:    v_mov_b32_e32 v2, v5
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX908-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX908-NEXT:    v_max_f16_e32 v0, v0, v5
+; GFX908-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX908-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX908-NEXT:    v_mov_b32_e32 v3, v1
+; GFX908-NEXT:    v_mov_b32_e32 v2, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -2835,36 +2829,36 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s20, 0x200
 ; GFX8-NEXT:    s_and_b32 s4, s20, -4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
-; GFX8-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX8-NEXT:    s_and_b32 s4, s20, 3
 ; GFX8-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX8-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_e32 v5, v0, v0
 ; GFX8-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX8-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT:    v_max_f16_e32 v1, v1, v0
-; GFX8-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX8-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v2, v5
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v5
+; GFX8-NEXT:    v_and_b32_e32 v2, s7, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -2872,38 +2866,38 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s20, 0x200
 ; GFX7-NEXT:    s_and_b32 s4, s20, -4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s4
-; GFX7-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v4, s4
+; GFX7-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-NEXT:    s_and_b32 s4, s20, 3
 ; GFX7-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s6
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v0
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX7-NEXT:    v_max_f32_e32 v1, v1, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX7-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX7-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7-NEXT:    v_mov_b32_e32 v2, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s7, v1
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2912,39 +2906,39 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s20, 0x200
 ; GFX6-NEXT:    s_and_b32 s4, s20, -4
-; GFX6-NEXT:    v_mov_b32_e32 v3, s4
-; GFX6-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v4, s4
+; GFX6-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    s_and_b32 s4, s20, 3
 ; GFX6-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX6-NEXT:    s_lshl_b32 s4, 0xffff, s6
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v0
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX6-NEXT:    v_max_f32_e32 v1, v1, v0
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX6-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX6-NEXT:    v_mov_b32_e32 v1, v4
-; GFX6-NEXT:    v_mov_b32_e32 v2, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX6-NEXT:    v_and_b32_e32 v2, s7, v1
+; GFX6-NEXT:    v_max_f32_e32 v0, v0, v5
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT:    v_mov_b32_e32 v3, v1
+; GFX6-NEXT:    v_mov_b32_e32 v2, v0
+; GFX6-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -2972,30 +2966,29 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
 ; GFX12-TRUE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], null offen
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v2, v3, s[0:3], null offen
 ; GFX12-TRUE16-NEXT:    s_not_b32 s6, s5
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-TRUE16-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v2
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
 ; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v1.l, v1.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v1.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_and_or_b32 v1, v2, s6, v1
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX12-TRUE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -3013,40 +3006,39 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_addk_co_i32 s16, 0x200
-; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v3, v0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX12-FAKE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], null offen
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, v2, s[0:3], null offen
 ; GFX12-FAKE16-NEXT:    s_not_b32 s6, s5
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-FAKE16-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v0
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX12-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -3061,30 +3053,30 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    s_addk_i32 s16, 0x200
 ; GFX942-NEXT:    s_and_b32 s4, s16, -4
-; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; GFX942-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen
 ; GFX942-NEXT:    s_and_b32 s4, s16, 3
 ; GFX942-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX942-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX942-NEXT:    s_not_b32 s7, s4
 ; GFX942-NEXT:    s_mov_b64 s[4:5], 0
-; GFX942-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX942-NEXT:    v_max_f16_e32 v3, v0, v0
 ; GFX942-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v5, v2
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, s6, v5
-; GFX942-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX942-NEXT:    v_max_f16_e32 v2, v2, v0
-; GFX942-NEXT:    v_lshlrev_b32_e32 v2, s6, v2
-; GFX942-NEXT:    v_and_or_b32 v4, v5, s7, v2
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX942-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX942-NEXT:    v_max_f16_e32 v0, v0, v3
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX942-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    buffer_wbl2 sc1
-; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
+; GFX942-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_inv sc1
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX942-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3103,31 +3095,30 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT:    buffer_load_b32 v2, v3, s[0:3], 0 offen
 ; GFX11-TRUE16-NEXT:    s_not_b32 s6, s5
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-TRUE16-NEXT:    .p2align 6
 ; GFX11-TRUE16-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v2
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v1.l, v0.h, v0.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v1, v2, s6, v1
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX11-TRUE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -3140,39 +3131,38 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s16, 0x200
-; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v3, v0, v0
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v2, s[0:3], 0 offen
 ; GFX11-FAKE16-NEXT:    s_not_b32 s6, s5
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-FAKE16-NEXT:    .p2align 6
 ; GFX11-FAKE16-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX11-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    buffer_gl1_inv
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -3185,32 +3175,32 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s20, 0x200
-; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX10-NEXT:    v_max_f16_e32 v3, v0, v0
 ; GFX10-NEXT:    s_and_b32 s4, s20, -4
-; GFX10-NEXT:    v_mov_b32_e32 v3, s4
+; GFX10-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-NEXT:    s_and_b32 s4, s20, 3
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX10-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX10-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX10-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX10-NEXT:    v_max_f16_e32 v1, v1, v0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_mov_b32_e32 v2, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX10-NEXT:    v_max_f16_e32 v0, v0, v3
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
@@ -3223,29 +3213,29 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s20, 0x200
 ; GFX90A-NEXT:    s_and_b32 s4, s20, -4
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s4
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX90A-NEXT:    s_and_b32 s4, s20, 3
 ; GFX90A-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX90A-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX90A-NEXT:    s_not_b32 s7, s4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX90A-NEXT:    v_max_f16_e32 v3, v0, v0
 ; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
-; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, s6, v5
-; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v0
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, s6, v2
-; GFX90A-NEXT:    v_and_or_b32 v4, v5, s7, v2
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX90A-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX90A-NEXT:    v_max_f16_e32 v0, v0, v3
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX90A-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3257,30 +3247,30 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s20, 0x200
 ; GFX908-NEXT:    s_and_b32 s4, s20, -4
-; GFX908-NEXT:    v_mov_b32_e32 v3, s4
-; GFX908-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX908-NEXT:    v_mov_b32_e32 v2, s4
+; GFX908-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX908-NEXT:    s_and_b32 s4, s20, 3
 ; GFX908-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX908-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX908-NEXT:    s_not_b32 s7, s4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX908-NEXT:    v_max_f16_e32 v3, v0, v0
 ; GFX908-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX908-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX908-NEXT:    v_max_f16_e32 v0, v0, v3
+; GFX908-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX908-NEXT:    v_and_or_b32 v0, v1, s7, v0
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v1
-; GFX908-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX908-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX908-NEXT:    v_max_f16_e32 v1, v1, v0
-; GFX908-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX908-NEXT:    v_and_or_b32 v4, v5, s7, v1
-; GFX908-NEXT:    v_mov_b32_e32 v1, v4
-; GFX908-NEXT:    v_mov_b32_e32 v2, v5
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v4, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3292,31 +3282,31 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s20, 0x200
 ; GFX8-NEXT:    s_and_b32 s4, s20, -4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
-; GFX8-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX8-NEXT:    s_and_b32 s4, s20, 3
 ; GFX8-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX8-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_e32 v3, v0, v0
 ; GFX8-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v3
+; GFX8-NEXT:    v_and_b32_e32 v4, s7, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX8-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT:    v_max_f16_e32 v1, v1, v0
-; GFX8-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX8-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v2, v5
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_mov_b32_e32 v4, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3328,33 +3318,33 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s20, 0x200
 ; GFX7-NEXT:    s_and_b32 s4, s20, -4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s4
-; GFX7-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-NEXT:    s_and_b32 s4, s20, 3
 ; GFX7-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s6
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v4, s7, v1
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX7-NEXT:    v_max_f32_e32 v1, v1, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX7-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX7-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7-NEXT:    v_mov_b32_e32 v2, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v4, v0
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3366,34 +3356,34 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s20, 0x200
 ; GFX6-NEXT:    s_and_b32 s4, s20, -4
-; GFX6-NEXT:    v_mov_b32_e32 v3, s4
-; GFX6-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    s_and_b32 s4, s20, 3
 ; GFX6-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX6-NEXT:    s_lshl_b32 s4, 0xffff, s6
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX6-NEXT:    v_max_f32_e32 v1, v1, v0
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX6-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX6-NEXT:    v_mov_b32_e32 v1, v4
-; GFX6-NEXT:    v_mov_b32_e32 v2, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX6-NEXT:    v_and_b32_e32 v4, s7, v1
+; GFX6-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX6-NEXT:    v_mov_b32_e32 v5, v1
+; GFX6-NEXT:    v_mov_b32_e32 v4, v0
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3446,21 +3436,18 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX12-TRUE16-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v8, v6
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, v9, v6
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, v9, v8
 ; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v4.h, v5.l, v5.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v5.l, v4.h, v4.l
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, v9, v5
-; GFX12-TRUE16-NEXT:    v_and_or_b32 v7, v8, v11, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v7
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v7, v8
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_and_or_b32 v5, v6, v11, v5
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5
 ; GFX12-TRUE16-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX12-TRUE16-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX12-TRUE16-NEXT:    v_readfirstlane_b32 s4, v0
@@ -3475,13 +3462,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[6:7], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
 ; GFX12-TRUE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB12_4
 ; GFX12-TRUE16-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX12-TRUE16-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v6, v8
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v7, v6
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v7
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-TRUE16-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
@@ -3489,7 +3477,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX12-TRUE16-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX12-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, v9, v6
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, v9, v7
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3522,7 +3510,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    buffer_load_b32 v4, v8, s[4:7], null offen
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v6, v8, s[4:7], null offen
 ; GFX12-FAKE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX12-FAKE16-NEXT:  ; %bb.2:
@@ -3533,19 +3521,17 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX12-FAKE16-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v4
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v4, v4, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v4, v4, v10
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_and_or_b32 v5, v6, v9, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v5, v6
 ; GFX12-FAKE16-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
@@ -3569,6 +3555,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX12-FAKE16-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-FAKE16-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
@@ -3601,7 +3588,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX942-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
 ; GFX942-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT:    buffer_load_dword v4, v9, s[4:7], 0 offen
+; GFX942-NEXT:    buffer_load_dword v7, v9, s[4:7], 0 offen
 ; GFX942-NEXT:    s_xor_b64 exec, exec, s[0:1]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX942-NEXT:  ; %bb.2:
@@ -3612,7 +3599,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX942-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX942-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX942-NEXT:    v_lshrrev_b32_e32 v4, v8, v7
 ; GFX942-NEXT:    v_max_f16_e32 v4, v4, v4
 ; GFX942-NEXT:    v_max_f16_e32 v4, v4, v11
@@ -3641,6 +3627,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
 ; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX942-NEXT:    buffer_inv sc1
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB12_3
@@ -3684,21 +3671,18 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX11-TRUE16-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, v9, v6
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, v9, v8
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v4.h, v5.l, v5.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v5.l, v4.h, v4.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, v9, v5
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v7, v8, v11, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v7
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v7, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v5, v6, v11, v5
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5
 ; GFX11-TRUE16-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX11-TRUE16-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v0
@@ -3712,13 +3696,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[6:7], v10, s[4:7], 0 offen glc
+; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], 0 offen glc
 ; GFX11-TRUE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB12_4
 ; GFX11-TRUE16-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v6, v8
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v7, v6
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v7
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
 ; GFX11-TRUE16-NEXT:    s_or_b32 s1, vcc_lo, s1
@@ -3727,7 +3712,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX11-TRUE16-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, v9, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, v9, v7
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -3754,7 +3739,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX11-FAKE16-NEXT:    buffer_load_b32 v4, v8, s[4:7], 0 offen
+; GFX11-FAKE16-NEXT:    buffer_load_b32 v6, v8, s[4:7], 0 offen
 ; GFX11-FAKE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX11-FAKE16-NEXT:  ; %bb.2:
@@ -3765,19 +3750,17 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX11-FAKE16-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX11-FAKE16-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_max_f16_e32 v4, v4, v10
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v5, v6, v9, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v6
 ; GFX11-FAKE16-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
@@ -3800,6 +3783,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX11-FAKE16-NEXT:    buffer_gl1_inv
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, vcc_lo, s1
@@ -3831,7 +3815,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
 ; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
-; GFX10-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX10-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB12_1
@@ -3842,10 +3826,9 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX10-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX10-NEXT:    v_max_f16_e32 v4, v4, v4
 ; GFX10-NEXT:    v_max_f16_e32 v4, v4, v10
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
@@ -3871,6 +3854,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
@@ -3903,7 +3887,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX90A-NEXT:    s_nop 0
-; GFX90A-NEXT:    buffer_load_dword v4, v9, s[8:11], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v7, v9, s[8:11], 0 offen
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX90A-NEXT:  ; %bb.2:
@@ -3914,7 +3898,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX90A-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v8, v7
 ; GFX90A-NEXT:    v_max_f16_e32 v4, v4, v4
 ; GFX90A-NEXT:    v_max_f16_e32 v4, v4, v11
@@ -3941,6 +3924,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB12_3
@@ -3970,7 +3954,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX908-NEXT:    s_nop 0
-; GFX908-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX908-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX908-NEXT:  ; %bb.2:
@@ -3981,7 +3965,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX908-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX908-NEXT:    v_max_f16_e32 v4, v4, v4
 ; GFX908-NEXT:    v_max_f16_e32 v4, v4, v10
@@ -4009,6 +3992,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB12_3
@@ -4038,7 +4022,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX8-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX8-NEXT:  ; %bb.2:
@@ -4049,7 +4033,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX8-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX8-NEXT:    v_max_f16_e32 v4, v4, v4
 ; GFX8-NEXT:    v_max_f16_e32 v4, v4, v10
@@ -4078,6 +4061,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB12_3
@@ -4105,19 +4089,18 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
 ; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX7-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX7-NEXT:  ; %bb.2:
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v5
 ; GFX7-NEXT:    s_mov_b64 s[6:7], 0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v4
 ; GFX7-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX7-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GFX7-NEXT:    v_and_b32_e32 v5, v6, v9
@@ -4147,6 +4130,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
 ; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB12_3
@@ -4175,20 +4159,18 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
 ; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX6-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX6-NEXT:  ; %bb.2:
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v5
 ; GFX6-NEXT:    s_mov_b64 s[6:7], 0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v10, v5
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v10, v4
 ; GFX6-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX6-NEXT:    ; Child Loop BB12_4 Depth 2
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v6, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GFX6-NEXT:    v_and_b32_e32 v5, v6, v9
@@ -4218,6 +4200,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
 ; GFX6-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB12_3
@@ -4245,55 +4228,55 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
 ; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_addk_co_i32 s16, 0x200
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX12-TRUE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], null offen
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, v4, s[0:3], null offen
 ; GFX12-TRUE16-NEXT:    s_not_b32 s6, s5
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-TRUE16-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v1, v1, v0
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v0, v0, v5
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-TRUE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v1.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v2
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX12-TRUE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
 ; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX12-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4305,54 +4288,54 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_addk_co_i32 s16, 0x200
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX12-FAKE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], null offen
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, v4, s[0:3], null offen
 ; GFX12-FAKE16-NEXT:    s_not_b32 s6, s5
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-FAKE16-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v1, v1, v0
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v0, v0, v5
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX12-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
 ; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX12-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4361,36 +4344,36 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    s_addk_i32 s16, 0x200
 ; GFX942-NEXT:    s_and_b32 s4, s16, -4
-; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; GFX942-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen
 ; GFX942-NEXT:    s_and_b32 s4, s16, 3
 ; GFX942-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX942-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX942-NEXT:    s_not_b32 s7, s4
 ; GFX942-NEXT:    s_mov_b64 s[4:5], 0
-; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX942-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX942-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v5, v2
-; GFX942-NEXT:    v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX942-NEXT:    buffer_wbl2 sc1
-; GFX942-NEXT:    v_max_f32_e32 v2, v2, v0
-; GFX942-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX942-NEXT:    v_or_b32_e32 v4, 0x400000, v2
-; GFX942-NEXT:    v_add3_u32 v3, v3, v2, s8
-; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX942-NEXT:    v_max_f32_e32 v0, v0, v5
+; GFX942-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX942-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX942-NEXT:    v_add3_u32 v2, v2, v0, s8
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX942-NEXT:    v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT:    v_and_or_b32 v4, v5, s7, v2
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
-; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX942-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_inv sc1
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX942-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4402,147 +4385,147 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_addk_i32 s16, 0x200
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, v4, s[0:3], 0 offen
 ; GFX11-TRUE16-NEXT:    s_not_b32 s6, s5
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-TRUE16-NEXT:    .p2align 6
 ; GFX11-TRUE16-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_max_f32_e32 v1, v1, v0
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_max_f32_e32 v0, v0, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v1.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v2
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX11-TRUE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX11-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s16, 0x200
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v4, s[0:3], 0 offen
 ; GFX11-FAKE16-NEXT:    s_not_b32 s6, s5
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-FAKE16-NEXT:    .p2align 6
 ; GFX11-FAKE16-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_max_f32_e32 v1, v1, v0
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    v_max_f32_e32 v0, v0, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    buffer_gl1_inv
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
 ; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX11-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s20, 0x200
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX10-NEXT:    s_and_b32 s4, s20, -4
-; GFX10-NEXT:    v_mov_b32_e32 v3, s4
+; GFX10-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX10-NEXT:    s_and_b32 s4, s20, 3
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX10-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX10-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, s4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_max_f32_e32 v1, v1, v0
-; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_mov_b32_e32 v2, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v5
+; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -4550,34 +4533,34 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s20, 0x200
 ; GFX90A-NEXT:    s_and_b32 s4, s20, -4
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s4
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
+; GFX90A-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX90A-NEXT:    s_and_b32 s4, s20, 3
 ; GFX90A-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX90A-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX90A-NEXT:    s_not_b32 s7, s4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
-; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v0
-; GFX90A-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX90A-NEXT:    v_or_b32_e32 v4, 0x400000, v2
-; GFX90A-NEXT:    v_add3_u32 v3, v3, v2, s8
-; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX90A-NEXT:    v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT:    v_and_or_b32 v4, v5, s7, v2
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT:    v_max_f32_e32 v0, v0, v5
+; GFX90A-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX90A-NEXT:    v_add3_u32 v2, v2, v0, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX90A-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4590,40 +4573,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s20, 0x200
 ; GFX908-NEXT:    s_and_b32 s4, s20, -4
-; GFX908-NEXT:    v_mov_b32_e32 v3, s4
-; GFX908-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX908-NEXT:    v_mov_b32_e32 v4, s4
+; GFX908-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX908-NEXT:    s_and_b32 s4, s20, 3
 ; GFX908-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX908-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX908-NEXT:    s_not_b32 s7, s4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v5, v1
-; GFX908-NEXT:    v_lshrrev_b32_sdwa v1, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT:    v_max_f32_e32 v1, v1, v0
-; GFX908-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX908-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX908-NEXT:    v_add3_u32 v2, v2, v1, s8
-; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX908-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
-; GFX908-NEXT:    v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT:    v_and_or_b32 v4, v5, s7, v1
-; GFX908-NEXT:    v_mov_b32_e32 v1, v4
-; GFX908-NEXT:    v_mov_b32_e32 v2, v5
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT:    v_max_f32_e32 v0, v0, v5
+; GFX908-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX908-NEXT:    v_add3_u32 v2, v2, v0, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX908-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX908-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX908-NEXT:    v_mov_b32_e32 v3, v1
+; GFX908-NEXT:    v_mov_b32_e32 v2, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -4631,42 +4614,42 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s20, 0x200
 ; GFX8-NEXT:    s_and_b32 s4, s20, -4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
-; GFX8-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX8-NEXT:    s_and_b32 s4, s20, 3
 ; GFX8-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX8-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_lshrrev_b32_sdwa v1, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_max_f32_e32 v1, v1, v0
-; GFX8-NEXT:    v_bfe_u32 v6, v1, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v1
+; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f32_e32 v3, v3, v5
+; GFX8-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v3
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v1
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
-; GFX8-NEXT:    v_and_b32_e32 v4, s7, v5
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_or_b32_e32 v4, v4, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v2, v5
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v3
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc
+; GFX8-NEXT:    v_and_b32_e32 v2, s7, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -4674,39 +4657,39 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s20, 0x200
 ; GFX7-NEXT:    s_and_b32 s4, s20, -4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s4
-; GFX7-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v4, s4
+; GFX7-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX7-NEXT:    s_and_b32 s4, s20, 3
 ; GFX7-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX7-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_max_f32_e32 v1, v1, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX7-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX7-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7-NEXT:    v_mov_b32_e32 v2, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s7, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4715,40 +4698,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s20, 0x200
 ; GFX6-NEXT:    s_and_b32 s4, s20, -4
-; GFX6-NEXT:    v_mov_b32_e32 v3, s4
-; GFX6-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v4, s4
+; GFX6-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX6-NEXT:    s_and_b32 s4, s20, 3
 ; GFX6-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX6-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX6-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT:    v_max_f32_e32 v0, v0, v5
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX6-NEXT:    v_max_f32_e32 v1, v1, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX6-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX6-NEXT:    v_mov_b32_e32 v1, v4
-; GFX6-NEXT:    v_mov_b32_e32 v2, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX6-NEXT:    v_and_b32_e32 v2, s7, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT:    v_mov_b32_e32 v3, v1
+; GFX6-NEXT:    v_mov_b32_e32 v2, v0
+; GFX6-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -4766,48 +4749,48 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
 ; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_addk_co_i32 s16, 0x200
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX12-TRUE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], null offen
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, v2, s[0:3], null offen
 ; GFX12-TRUE16-NEXT:    s_not_b32 s6, s5
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-TRUE16-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v1, v1, v0
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v0, v0, v3
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-TRUE16-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v1.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.h, 0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v0.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX12-TRUE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -4825,47 +4808,47 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_addk_co_i32 s16, 0x200
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX12-FAKE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], null offen
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, v2, s[0:3], null offen
 ; GFX12-FAKE16-NEXT:    s_not_b32 s6, s5
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-FAKE16-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v1, v1, v0
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v0, v0, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX12-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -4880,36 +4863,36 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    s_addk_i32 s16, 0x200
 ; GFX942-NEXT:    s_and_b32 s4, s16, -4
-; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; GFX942-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen
 ; GFX942-NEXT:    s_and_b32 s4, s16, 3
 ; GFX942-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX942-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX942-NEXT:    s_not_b32 s7, s4
 ; GFX942-NEXT:    s_mov_b64 s[4:5], 0
-; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX942-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX942-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v5, v2
-; GFX942-NEXT:    v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX942-NEXT:    buffer_wbl2 sc1
-; GFX942-NEXT:    v_max_f32_e32 v2, v2, v0
-; GFX942-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX942-NEXT:    v_or_b32_e32 v4, 0x400000, v2
-; GFX942-NEXT:    v_add3_u32 v3, v3, v2, s8
-; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX942-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX942-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX942-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX942-NEXT:    v_add3_u32 v4, v4, v0, s8
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX942-NEXT:    v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT:    v_and_or_b32 v4, v5, s7, v2
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
-; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX942-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_inv sc1
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX942-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4920,46 +4903,46 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_addk_i32 s16, 0x200
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, v2, s[0:3], 0 offen
 ; GFX11-TRUE16-NEXT:    s_not_b32 s6, s5
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-TRUE16-NEXT:    .p2align 6
 ; GFX11-TRUE16-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_max_f32_e32 v1, v1, v0
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v1.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v0.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v4
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX11-TRUE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -4972,45 +4955,45 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s16, 0x200
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v2, s[0:3], 0 offen
 ; GFX11-FAKE16-NEXT:    s_not_b32 s6, s5
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-FAKE16-NEXT:    .p2align 6
 ; GFX11-FAKE16-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_max_f32_e32 v1, v1, v0
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    buffer_gl1_inv
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -5023,36 +5006,36 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s20, 0x200
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX10-NEXT:    s_and_b32 s4, s20, -4
-; GFX10-NEXT:    v_mov_b32_e32 v3, s4
+; GFX10-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-NEXT:    s_and_b32 s4, s20, 3
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX10-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX10-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, s4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_max_f32_e32 v1, v1, v0
-; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_mov_b32_e32 v2, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX10-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
@@ -5065,34 +5048,34 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s20, 0x200
 ; GFX90A-NEXT:    s_and_b32 s4, s20, -4
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s4
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX90A-NEXT:    s_and_b32 s4, s20, 3
 ; GFX90A-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX90A-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX90A-NEXT:    s_not_b32 s7, s4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
-; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v0
-; GFX90A-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX90A-NEXT:    v_or_b32_e32 v4, 0x400000, v2
-; GFX90A-NEXT:    v_add3_u32 v3, v3, v2, s8
-; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX90A-NEXT:    v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT:    v_and_or_b32 v4, v5, s7, v2
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX90A-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX90A-NEXT:    v_add3_u32 v4, v4, v0, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX90A-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5104,35 +5087,35 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s20, 0x200
 ; GFX908-NEXT:    s_and_b32 s4, s20, -4
-; GFX908-NEXT:    v_mov_b32_e32 v3, s4
-; GFX908-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX908-NEXT:    v_mov_b32_e32 v2, s4
+; GFX908-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX908-NEXT:    s_and_b32 s4, s20, 3
 ; GFX908-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX908-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX908-NEXT:    s_not_b32 s7, s4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX908-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX908-NEXT:    v_add3_u32 v4, v4, v0, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX908-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX908-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT:    v_and_or_b32 v0, v1, s7, v0
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v1
-; GFX908-NEXT:    v_lshrrev_b32_sdwa v1, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT:    v_max_f32_e32 v1, v1, v0
-; GFX908-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX908-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX908-NEXT:    v_add3_u32 v2, v2, v1, s8
-; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX908-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
-; GFX908-NEXT:    v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT:    v_and_or_b32 v4, v5, s7, v1
-; GFX908-NEXT:    v_mov_b32_e32 v1, v4
-; GFX908-NEXT:    v_mov_b32_e32 v2, v5
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v4, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5144,37 +5127,37 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s20, 0x200
 ; GFX8-NEXT:    s_and_b32 s4, s20, -4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
-; GFX8-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX8-NEXT:    s_and_b32 s4, s20, 3
 ; GFX8-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX8-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_lshrrev_b32_sdwa v1, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_max_f32_e32 v1, v1, v0
-; GFX8-NEXT:    v_bfe_u32 v6, v1, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v1
+; GFX8-NEXT:    v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f32_e32 v5, v5, v3
+; GFX8-NEXT:    v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v5
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v1
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
-; GFX8-NEXT:    v_and_b32_e32 v4, s7, v5
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_or_b32_e32 v4, v4, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v2, v5
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v5
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX8-NEXT:    v_and_b32_e32 v4, s7, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT:    v_mov_b32_e32 v5, v1
+; GFX8-NEXT:    v_mov_b32_e32 v4, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5186,34 +5169,34 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s20, 0x200
 ; GFX7-NEXT:    s_and_b32 s4, s20, -4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s4
-; GFX7-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX7-NEXT:    s_and_b32 s4, s20, 3
 ; GFX7-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX7-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v4, s7, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_max_f32_e32 v1, v1, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX7-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX7-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7-NEXT:    v_mov_b32_e32 v2, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v4, v0
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5225,35 +5208,35 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s20, 0x200
 ; GFX6-NEXT:    s_and_b32 s4, s20, -4
-; GFX6-NEXT:    v_mov_b32_e32 v3, s4
-; GFX6-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX6-NEXT:    s_and_b32 s4, s20, 3
 ; GFX6-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX6-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX6-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX6-NEXT:    v_max_f32_e32 v1, v1, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX6-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX6-NEXT:    v_mov_b32_e32 v1, v4
-; GFX6-NEXT:    v_mov_b32_e32 v2, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX6-NEXT:    v_and_b32_e32 v4, s7, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX6-NEXT:    v_mov_b32_e32 v5, v1
+; GFX6-NEXT:    v_mov_b32_e32 v4, v0
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5295,7 +5278,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_b32 v4, v8, s[4:7], null offen
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v6, v8, s[4:7], null offen
 ; GFX12-TRUE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX12-TRUE16-NEXT:  ; %bb.2:
@@ -5306,27 +5289,26 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX12-TRUE16-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v4
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v4, v4, v10
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v5, v4, 16, 1
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v4
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v4.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v5
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v5
 ; GFX12-TRUE16-NEXT:    v_and_or_b32 v5, v6, v9, v4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, v6
 ; GFX12-TRUE16-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
@@ -5350,6 +5332,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX12-TRUE16-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-TRUE16-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
@@ -5390,7 +5373,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    buffer_load_b32 v4, v8, s[4:7], null offen
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v6, v8, s[4:7], null offen
 ; GFX12-FAKE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX12-FAKE16-NEXT:  ; %bb.2:
@@ -5401,26 +5384,25 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX12-FAKE16-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v4
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v4, v4, v10
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v5, v4, 16, 1
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v4
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
 ; GFX12-FAKE16-NEXT:    v_and_or_b32 v5, v6, v9, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v5, v6
 ; GFX12-FAKE16-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
@@ -5444,6 +5426,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX12-FAKE16-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-FAKE16-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
@@ -5476,7 +5459,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX942-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
 ; GFX942-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT:    buffer_load_dword v4, v9, s[4:7], 0 offen
+; GFX942-NEXT:    buffer_load_dword v7, v9, s[4:7], 0 offen
 ; GFX942-NEXT:    s_xor_b64 exec, exec, s[0:1]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX942-NEXT:  ; %bb.2:
@@ -5488,7 +5471,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX942-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX942-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX942-NEXT:    v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX942-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX942-NEXT:    v_max_f32_e32 v4, v4, v11
@@ -5522,6 +5504,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
 ; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX942-NEXT:    buffer_inv sc1
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB15_3
@@ -5554,7 +5537,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX11-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT:    buffer_load_b32 v4, v8, s[4:7], 0 offen
+; GFX11-TRUE16-NEXT:    buffer_load_b32 v6, v8, s[4:7], 0 offen
 ; GFX11-TRUE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX11-TRUE16-NEXT:  ; %bb.2:
@@ -5566,26 +5549,25 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX11-TRUE16-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_max_f32_e32 v4, v4, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v5, v4, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v4
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v4.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v5
 ; GFX11-TRUE16-NEXT:    v_and_or_b32 v5, v6, v9, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v6
 ; GFX11-TRUE16-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
@@ -5608,6 +5590,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
 ; GFX11-TRUE16-NEXT:    s_or_b32 s1, vcc_lo, s1
@@ -5644,7 +5627,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX11-FAKE16-NEXT:    buffer_load_b32 v4, v8, s[4:7], 0 offen
+; GFX11-FAKE16-NEXT:    buffer_load_b32 v6, v8, s[4:7], 0 offen
 ; GFX11-FAKE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX11-FAKE16-NEXT:  ; %bb.2:
@@ -5656,25 +5639,24 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX11-FAKE16-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_max_f32_e32 v4, v4, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v4, 16, 1
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v4
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v5, v6, v9, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v6
 ; GFX11-FAKE16-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
@@ -5697,6 +5679,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX11-FAKE16-NEXT:    buffer_gl1_inv
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, vcc_lo, s1
@@ -5729,7 +5712,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
 ; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
-; GFX10-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX10-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
@@ -5740,10 +5723,9 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX10-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, v4
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_max_f32_e32 v4, v4, v10
 ; GFX10-NEXT:    v_bfe_u32 v5, v4, 16, 1
 ; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v4
@@ -5773,6 +5755,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
@@ -5805,7 +5788,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX90A-NEXT:    s_nop 0
-; GFX90A-NEXT:    buffer_load_dword v4, v9, s[8:11], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v7, v9, s[8:11], 0 offen
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX90A-NEXT:  ; %bb.2:
@@ -5817,7 +5800,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX90A-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX90A-NEXT:    v_max_f32_e32 v4, v4, v11
 ; GFX90A-NEXT:    v_bfe_u32 v5, v4, 16, 1
@@ -5848,6 +5830,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB15_3
@@ -5877,7 +5860,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX908-NEXT:    s_nop 0
-; GFX908-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX908-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX908-NEXT:  ; %bb.2:
@@ -5889,7 +5872,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX908-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX908-NEXT:    v_max_f32_e32 v4, v4, v10
 ; GFX908-NEXT:    v_bfe_u32 v5, v4, 16, 1
@@ -5921,6 +5903,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB15_3
@@ -5950,7 +5933,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX8-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX8-NEXT:  ; %bb.2:
@@ -5961,7 +5944,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX8-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_max_f32_e32 v4, v4, v10
 ; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
@@ -5995,6 +5977,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB15_3
@@ -6022,19 +6005,18 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
 ; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX7-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX7-NEXT:  ; %bb.2:
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v5
 ; GFX7-NEXT:    s_mov_b64 s[6:7], 0
-; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
 ; GFX7-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX7-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -6065,6 +6047,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
 ; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB15_3
@@ -6093,20 +6076,18 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
 ; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX6-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX6-NEXT:  ; %bb.2:
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v5
 ; GFX6-NEXT:    s_mov_b64 s[6:7], 0
-; GFX6-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
+; GFX6-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
 ; GFX6-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX6-NEXT:    ; Child Loop BB15_4 Depth 2
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v6, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -6137,6 +6118,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
 ; GFX6-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB15_3
@@ -6486,7 +6468,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, s16
 ; GFX12-NEXT:    s_add_co_i32 s4, s16, 0x400
-; GFX12-NEXT:    v_pk_max_num_f16 v0, v0, v0
+; GFX12-NEXT:    v_pk_max_num_f16 v2, v0, v0
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX12-NEXT:    s_mov_b32 s4, 0
@@ -6494,17 +6476,16 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
 ; GFX12-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-NEXT:    v_pk_max_num_f16 v0, v1, v1
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_pk_max_num_f16 v1, v5, v5
-; GFX12-NEXT:    v_pk_max_num_f16 v4, v1, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    v_pk_max_num_f16 v0, v0, v2
+; GFX12-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -6518,25 +6499,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_mov_b32_e32 v1, s16
-; GFX942-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
 ; GFX942-NEXT:    s_add_i32 s6, s16, 0x400
 ; GFX942-NEXT:    s_mov_b64 s[4:5], 0
-; GFX942-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-NEXT:    v_pk_max_f16 v2, v0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX942-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v5, v2
-; GFX942-NEXT:    v_pk_max_f16 v2, v5, v5
+; GFX942-NEXT:    v_pk_max_f16 v0, v1, v1
 ; GFX942-NEXT:    buffer_wbl2 sc1
-; GFX942-NEXT:    v_pk_max_f16 v4, v2, v0
+; GFX942-NEXT:    v_pk_max_f16 v0, v0, v2
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
-; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_inv sc1
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX942-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6548,25 +6529,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s16
 ; GFX11-NEXT:    s_add_i32 s4, s16, 0x400
-; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-NEXT:    v_pk_max_f16 v2, v0, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
 ; GFX11-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-NEXT:    v_pk_max_f16 v0, v1, v1
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_pk_max_f16 v1, v5, v5
-; GFX11-NEXT:    v_pk_max_f16 v4, v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX11-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -6580,24 +6560,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s20
 ; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
-; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT:    v_pk_max_f16 v2, v0, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX10-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_pk_max_f16 v0, v1, v1
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_pk_max_f16 v1, v5, v5
-; GFX10-NEXT:    v_pk_max_f16 v4, v1, v0
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_mov_b32_e32 v2, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB17_1
@@ -6609,23 +6589,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX90A-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NEXT:    v_pk_max_f16 v2, v0, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX90A-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
-; GFX90A-NEXT:    v_pk_max_f16 v2, v5, v5
-; GFX90A-NEXT:    v_pk_max_f16 v4, v2, v0
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_pk_max_f16 v0, v1, v1
+; GFX90A-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6639,21 +6619,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
 ; GFX908-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX908-NEXT:    v_pk_max_f16 v2, v0, v0
 ; GFX908-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX908-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_pk_max_f16 v0, v1, v1
+; GFX908-NEXT:    v_pk_max_f16 v0, v0, v2
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v1
-; GFX908-NEXT:    v_pk_max_f16 v1, v5, v5
-; GFX908-NEXT:    v_pk_max_f16 v4, v1, v0
-; GFX908-NEXT:    v_mov_b32_e32 v1, v4
-; GFX908-NEXT:    v_mov_b32_e32 v2, v5
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v4, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6667,25 +6647,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
 ; GFX8-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v3, v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s6
 ; GFX8-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v5, v1, v1
+; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_e32 v5, v5, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v2, v6, v6
-; GFX8-NEXT:    v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_max_f16_e32 v2, v2, v0
-; GFX8-NEXT:    v_or_b32_e32 v5, v2, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, v5
-; GFX8-NEXT:    v_mov_b32_e32 v2, v6
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_mov_b32_e32 v5, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7951,45 +7931,43 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, s16
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
 ; GFX12-TRUE16-NEXT:    s_add_co_i32 s4, s16, 0x400
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
 ; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s4, 0
 ; GFX12-TRUE16-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_dual_max_num_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v1, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v1, v1, v0
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT:    v_dual_max_num_f32 v5, v5, v2 :: v_dual_max_num_f32 v0, v0, v3
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-TRUE16-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v5, v8, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v1, v2
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
-; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v5
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v6
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX12-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -8006,44 +7984,42 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, s16
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
 ; GFX12-FAKE16-NEXT:    s_add_co_i32 s4, s16, 0x400
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
 ; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-FAKE16-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v1
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_dual_max_num_f32 v2, v2, v0 :: v_dual_lshlrev_b32 v1, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v1, v1, v3
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT:    v_dual_max_num_f32 v5, v5, v3 :: v_dual_max_num_f32 v0, v0, v2
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-FAKE16-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v1, v1
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v5, v8, s4
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v2, v1, 0x7060302
-; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
-; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v6
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX12-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -8057,40 +8033,40 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_mov_b32_e32 v1, s16
-; GFX942-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
 ; GFX942-NEXT:    s_add_i32 s4, s16, 0x400
 ; GFX942-NEXT:    s_mov_b64 s[6:7], 0
-; GFX942-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX942-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX942-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX942-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX942-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX942-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX942-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v7, v2
-; GFX942-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX942-NEXT:    v_and_b32_e32 v3, 0xffff0000, v7
-; GFX942-NEXT:    v_max_f32_e32 v2, v2, v1
-; GFX942-NEXT:    v_max_f32_e32 v3, v3, v0
-; GFX942-NEXT:    v_bfe_u32 v5, v2, 16, 1
-; GFX942-NEXT:    v_bfe_u32 v8, v3, 16, 1
-; GFX942-NEXT:    v_or_b32_e32 v6, 0x400000, v2
-; GFX942-NEXT:    v_or_b32_e32 v9, 0x400000, v3
-; GFX942-NEXT:    v_add3_u32 v5, v5, v2, s8
-; GFX942-NEXT:    v_add3_u32 v8, v8, v3, s8
-; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX942-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX942-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX942-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX942-NEXT:    v_max_f32_e32 v5, v5, v3
+; GFX942-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX942-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX942-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX942-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX942-NEXT:    v_add3_u32 v6, v6, v0, s8
+; GFX942-NEXT:    v_add3_u32 v8, v8, v5, s8
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
 ; GFX942-NEXT:    buffer_wbl2 sc1
-; GFX942-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX942-NEXT:    v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX942-NEXT:    v_perm_b32 v6, v3, v2, s9
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
-; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX942-NEXT:    v_perm_b32 v0, v5, v0, s9
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
+; GFX942-NEXT:    buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_inv sc1
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v7
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v1
 ; GFX942-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8100,11 +8076,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, s16
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    s_add_i32 s4, s16, 0x400
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
 ; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
 ; GFX11-TRUE16-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -8112,35 +8087,34 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX11-TRUE16-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX11-TRUE16-NEXT:    v_dual_max_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v1, 16, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_max_f32_e32 v1, v1, v0
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_dual_max_f32 v5, v5, v2 :: v_dual_max_f32 v0, v0, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v5, 16, 1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v5, v8, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v1, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
-; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v6
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX11-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -8153,11 +8127,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, s16
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
 ; GFX11-FAKE16-NEXT:    s_add_i32 s4, s16, 0x400
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
 ; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-FAKE16-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -8165,33 +8138,32 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX11-FAKE16-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX11-FAKE16-NEXT:    v_dual_max_f32 v2, v2, v0 :: v_dual_lshlrev_b32 v1, 16, v6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_max_f32 v5, v5, v3 :: v_dual_max_f32 v0, v0, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v5, 16, 1
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-FAKE16-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v1, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v5, v8, s4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v2, v1, 0x7060302
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
-; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    buffer_gl1_inv
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v6
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -8206,38 +8178,38 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s20
 ; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX10-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX10-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v0
-; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
-; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
-; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v1, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v8, s4
-; GFX10-NEXT:    v_perm_b32 v5, v2, v1, 0x7060302
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
-; GFX10-NEXT:    v_mov_b32_e32 v2, v6
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_max_f32_e32 v5, v5, v3
+; GFX10-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX10-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v6, v1
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v6
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB20_1
@@ -8249,39 +8221,39 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX90A-NEXT:    s_add_i32 s4, s20, 0x400
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX90A-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v7, v2
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v7
-; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v1
-; GFX90A-NEXT:    v_max_f32_e32 v3, v3, v0
-; GFX90A-NEXT:    v_bfe_u32 v5, v2, 16, 1
-; GFX90A-NEXT:    v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT:    v_or_b32_e32 v6, 0x400000, v2
-; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT:    v_add3_u32 v5, v5, v2, s8
-; GFX90A-NEXT:    v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX90A-NEXT:    v_perm_b32 v6, v3, v2, s9
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX90A-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX90A-NEXT:    v_max_f32_e32 v5, v5, v3
+; GFX90A-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT:    v_add3_u32 v6, v6, v0, s8
+; GFX90A-NEXT:    v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT:    v_perm_b32 v0, v5, v0, s9
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v7
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v1
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8295,37 +8267,37 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX908-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX908-NEXT:    s_add_i32 s4, s20, 0x400
 ; GFX908-NEXT:    s_mov_b64 s[6:7], 0
-; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX908-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX908-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX908-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX908-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX908-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX908-NEXT:    v_max_f32_e32 v5, v5, v3
+; GFX908-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT:    v_add3_u32 v6, v6, v0, s8
+; GFX908-NEXT:    v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT:    v_perm_b32 v0, v5, v0, s9
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v1
-; GFX908-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX908-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX908-NEXT:    v_max_f32_e32 v2, v2, v0
-; GFX908-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX908-NEXT:    v_bfe_u32 v8, v2, 16, 1
-; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v1
-; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX908-NEXT:    v_add3_u32 v5, v5, v1, s8
-; GFX908-NEXT:    v_add3_u32 v8, v8, v2, s8
-; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX908-NEXT:    v_cndmask_b32_e64 v1, v5, v7, s[4:5]
-; GFX908-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
-; GFX908-NEXT:    v_perm_b32 v5, v2, v1, s9
-; GFX908-NEXT:    v_mov_b32_e32 v1, v5
-; GFX908-NEXT:    v_mov_b32_e32 v2, v6
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v5, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v6
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8339,38 +8311,38 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX8-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX8-NEXT:    s_add_i32 s4, s20, 0x400
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX8-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v6, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX8-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_max_f32_e32 v2, v2, v0
-; GFX8-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX8-NEXT:    v_bfe_u32 v8, v2, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v1
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v2
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_max_f32_e32 v5, v5, v3
+; GFX8-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v1
-; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v5, v7, s[4:5]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_alignbit_b32 v5, v2, v1, 16
-; GFX8-NEXT:    v_mov_b32_e32 v1, v5
-; GFX8-NEXT:    v_mov_b32_e32 v2, v6
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_alignbit_b32 v0, v5, v0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v6, v1
+; GFX8-NEXT:    v_mov_b32_e32 v5, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end

diff  --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index 1f51c93d08db1..1a25904dd553f 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -211,24 +211,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_mov_b32_e32 v1, s16
-; GFX942-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
 ; GFX942-NEXT:    s_add_i32 s6, s16, 0x400
 ; GFX942-NEXT:    s_mov_b64 s[4:5], 0
-; GFX942-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX942-NEXT:  .LBB1_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v5, v2
-; GFX942-NEXT:    v_max_f32_e32 v2, v5, v5
-; GFX942-NEXT:    v_min_f32_e32 v4, v2, v0
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    buffer_wbl2 sc1
-; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
+; GFX942-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_inv sc1
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX942-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB1_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -261,23 +261,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX90A-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX90A-NEXT:  .LBB1_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
-; GFX90A-NEXT:    v_max_f32_e32 v2, v5, v5
-; GFX90A-NEXT:    v_min_f32_e32 v4, v2, v0
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX90A-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB1_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -291,21 +291,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_
 ; GFX908-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX908-NEXT:    v_max_f32_e32 v2, v0, v0
 ; GFX908-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX908-NEXT:  .LBB1_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX908-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v1
-; GFX908-NEXT:    v_max_f32_e32 v1, v5, v5
-; GFX908-NEXT:    v_min_f32_e32 v4, v1, v0
-; GFX908-NEXT:    v_mov_b32_e32 v1, v4
-; GFX908-NEXT:    v_mov_b32_e32 v2, v5
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v4, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB1_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -319,21 +319,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_
 ; GFX8-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX8-NEXT:  .LBB1_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX8-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v5
-; GFX8-NEXT:    v_min_f32_e32 v4, v1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v2, v5
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_mov_b32_e32 v4, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB1_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1396,7 +1396,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s16
-; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
 ; GFX12-NEXT:    s_add_co_i32 s4, s16, 0x800
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    v_mov_b32_e32 v6, s4
@@ -1405,18 +1405,17 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
 ; GFX12-NEXT:  .LBB6_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[9:10], v[9:10]
-; GFX12-NEXT:    v_min_num_f64_e32 v[7:8], v[2:3], v[0:1]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
-; GFX12-NEXT:    v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10
-; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX12-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10]
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
+; GFX12-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
 ; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1440,7 +1439,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s16
-; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX11-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
 ; GFX11-NEXT:    s_add_i32 s4, s16, 0x800
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, s4
@@ -1449,19 +1448,18 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
 ; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX11-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[2:3], v[9:10], v[9:10]
-; GFX11-NEXT:    v_min_f64 v[7:8], v[2:3], v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
-; GFX11-NEXT:    v_dual_mov_b32 v4, v9 :: v_dual_mov_b32 v5, v10
-; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[2:5], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX11-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[9:10]
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
+; GFX11-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
 ; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -1495,26 +1493,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v2, s20
 ; GFX908-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
-; GFX908-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX908-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
 ; GFX908-NEXT:    s_add_i32 s6, s20, 0x800
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_mov_b32_e32 v6, s6
 ; GFX908-NEXT:  .LBB6_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v10, v3
 ; GFX908-NEXT:    v_mov_b32_e32 v9, v2
-; GFX908-NEXT:    v_max_f64 v[2:3], v[9:10], v[9:10]
-; GFX908-NEXT:    v_min_f64 v[7:8], v[2:3], v[0:1]
-; GFX908-NEXT:    v_mov_b32_e32 v2, v7
-; GFX908-NEXT:    v_mov_b32_e32 v3, v8
-; GFX908-NEXT:    v_mov_b32_e32 v4, v9
-; GFX908-NEXT:    v_mov_b32_e32 v5, v10
-; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v8, v1
+; GFX908-NEXT:    v_mov_b32_e32 v7, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10]
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
+; GFX908-NEXT:    v_mov_b32_e32 v2, v7
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB6_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1526,26 +1524,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s20
 ; GFX8-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
-; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX8-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
 ; GFX8-NEXT:    s_add_i32 s6, s20, 0x800
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s6
 ; GFX8-NEXT:  .LBB6_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v10, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v9, v2
-; GFX8-NEXT:    v_max_f64 v[2:3], v[9:10], v[9:10]
-; GFX8-NEXT:    v_min_f64 v[7:8], v[2:3], v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v2, v7
-; GFX8-NEXT:    v_mov_b32_e32 v3, v8
-; GFX8-NEXT:    v_mov_b32_e32 v4, v9
-; GFX8-NEXT:    v_mov_b32_e32 v5, v10
-; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[2:5], v6, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_mov_b32_e32 v8, v1
+; GFX8-NEXT:    v_mov_b32_e32 v7, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[9:10]
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v2, v7
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB6_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2501,43 +2499,42 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX12-TRUE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], null offen
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v2, v5, s[0:3], null offen
 ; GFX12-TRUE16-NEXT:    s_not_b32 s6, s5
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-TRUE16-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v2
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
 ; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v1.l, v1.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_min_num_f16_e32 v1.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_and_or_b32 v1, v2, s6, v1
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX12-TRUE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
 ; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX12-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v3
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2549,47 +2546,46 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_addk_co_i32 s16, 0x200
-; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v5, v0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX12-FAKE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], null offen
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, v4, s[0:3], null offen
 ; GFX12-FAKE16-NEXT:    s_not_b32 s6, s5
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-FAKE16-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v5
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_min_num_f16_e32 v1, v1, v0
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX12-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
 ; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX12-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2598,30 +2594,30 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    s_addk_i32 s16, 0x200
 ; GFX942-NEXT:    s_and_b32 s4, s16, -4
-; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; GFX942-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen
 ; GFX942-NEXT:    s_and_b32 s4, s16, 3
 ; GFX942-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX942-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX942-NEXT:    s_not_b32 s7, s4
 ; GFX942-NEXT:    s_mov_b64 s[4:5], 0
-; GFX942-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX942-NEXT:    v_max_f16_e32 v5, v0, v0
 ; GFX942-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v5, v2
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, s6, v5
-; GFX942-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX942-NEXT:    v_min_f16_e32 v2, v2, v0
-; GFX942-NEXT:    v_lshlrev_b32_e32 v2, s6, v2
-; GFX942-NEXT:    v_and_or_b32 v4, v5, s7, v2
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX942-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX942-NEXT:    v_min_f16_e32 v0, v0, v5
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX942-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-NEXT:    buffer_wbl2 sc1
-; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
+; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_inv sc1
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX942-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2636,127 +2632,125 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT:    buffer_load_b32 v2, v5, s[0:3], 0 offen
 ; GFX11-TRUE16-NEXT:    s_not_b32 s6, s5
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-TRUE16-NEXT:    .p2align 6
 ; GFX11-TRUE16-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v2
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_min_f16_e32 v1.l, v0.h, v0.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v1, v2, s6, v1
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX11-TRUE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX11-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v3
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s16, 0x200
-; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v5, v0, v0
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v4, s[0:3], 0 offen
 ; GFX11-FAKE16-NEXT:    s_not_b32 s6, s5
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-FAKE16-NEXT:    .p2align 6
 ; GFX11-FAKE16-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX11-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_min_f16_e32 v1, v1, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    buffer_gl1_inv
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
 ; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX11-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s20, 0x200
-; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX10-NEXT:    v_max_f16_e32 v5, v0, v0
 ; GFX10-NEXT:    s_and_b32 s4, s20, -4
-; GFX10-NEXT:    v_mov_b32_e32 v3, s4
+; GFX10-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX10-NEXT:    s_and_b32 s4, s20, 3
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX10-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX10-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX10-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX10-NEXT:    v_min_f16_e32 v1, v1, v0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_mov_b32_e32 v2, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX10-NEXT:    v_min_f16_e32 v0, v0, v5
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -2764,29 +2758,29 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s20, 0x200
 ; GFX90A-NEXT:    s_and_b32 s4, s20, -4
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s4
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
+; GFX90A-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX90A-NEXT:    s_and_b32 s4, s20, 3
 ; GFX90A-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX90A-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX90A-NEXT:    s_not_b32 s7, s4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX90A-NEXT:    v_max_f16_e32 v5, v0, v0
 ; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
-; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, s6, v5
-; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX90A-NEXT:    v_min_f16_e32 v2, v2, v0
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, s6, v2
-; GFX90A-NEXT:    v_and_or_b32 v4, v5, s7, v2
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX90A-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX90A-NEXT:    v_min_f16_e32 v0, v0, v5
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX90A-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2799,35 +2793,35 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s20, 0x200
 ; GFX908-NEXT:    s_and_b32 s4, s20, -4
-; GFX908-NEXT:    v_mov_b32_e32 v3, s4
-; GFX908-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX908-NEXT:    v_mov_b32_e32 v4, s4
+; GFX908-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX908-NEXT:    s_and_b32 s4, s20, 3
 ; GFX908-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX908-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX908-NEXT:    s_not_b32 s7, s4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX908-NEXT:    v_max_f16_e32 v5, v0, v0
 ; GFX908-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v5, v1
-; GFX908-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX908-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX908-NEXT:    v_min_f16_e32 v1, v1, v0
-; GFX908-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX908-NEXT:    v_and_or_b32 v4, v5, s7, v1
-; GFX908-NEXT:    v_mov_b32_e32 v1, v4
-; GFX908-NEXT:    v_mov_b32_e32 v2, v5
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX908-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX908-NEXT:    v_min_f16_e32 v0, v0, v5
+; GFX908-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX908-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX908-NEXT:    v_mov_b32_e32 v3, v1
+; GFX908-NEXT:    v_mov_b32_e32 v2, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -2835,36 +2829,36 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s20, 0x200
 ; GFX8-NEXT:    s_and_b32 s4, s20, -4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
-; GFX8-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX8-NEXT:    s_and_b32 s4, s20, 3
 ; GFX8-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX8-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_e32 v5, v0, v0
 ; GFX8-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX8-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT:    v_min_f16_e32 v1, v1, v0
-; GFX8-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX8-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v2, v5
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_min_f16_e32 v0, v0, v5
+; GFX8-NEXT:    v_and_b32_e32 v2, s7, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -2872,38 +2866,38 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s20, 0x200
 ; GFX7-NEXT:    s_and_b32 s4, s20, -4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s4
-; GFX7-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v4, s4
+; GFX7-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-NEXT:    s_and_b32 s4, s20, 3
 ; GFX7-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s6
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v0
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX7-NEXT:    v_min_f32_e32 v1, v1, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX7-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX7-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7-NEXT:    v_mov_b32_e32 v2, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s7, v1
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2912,39 +2906,39 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s20, 0x200
 ; GFX6-NEXT:    s_and_b32 s4, s20, -4
-; GFX6-NEXT:    v_mov_b32_e32 v3, s4
-; GFX6-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v4, s4
+; GFX6-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    s_and_b32 s4, s20, 3
 ; GFX6-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX6-NEXT:    s_lshl_b32 s4, 0xffff, s6
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v0
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX6-NEXT:    v_min_f32_e32 v1, v1, v0
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX6-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX6-NEXT:    v_mov_b32_e32 v1, v4
-; GFX6-NEXT:    v_mov_b32_e32 v2, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX6-NEXT:    v_and_b32_e32 v2, s7, v1
+; GFX6-NEXT:    v_min_f32_e32 v0, v0, v5
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT:    v_mov_b32_e32 v3, v1
+; GFX6-NEXT:    v_mov_b32_e32 v2, v0
+; GFX6-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -2972,30 +2966,29 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
 ; GFX12-TRUE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], null offen
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v2, v3, s[0:3], null offen
 ; GFX12-TRUE16-NEXT:    s_not_b32 s6, s5
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-TRUE16-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v2
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
 ; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v1.l, v1.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_min_num_f16_e32 v1.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_and_or_b32 v1, v2, s6, v1
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX12-TRUE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -3013,40 +3006,39 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_addk_co_i32 s16, 0x200
-; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v3, v0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX12-FAKE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], null offen
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, v2, s[0:3], null offen
 ; GFX12-FAKE16-NEXT:    s_not_b32 s6, s5
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-FAKE16-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v3
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_min_num_f16_e32 v1, v1, v0
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX12-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -3061,30 +3053,30 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    s_addk_i32 s16, 0x200
 ; GFX942-NEXT:    s_and_b32 s4, s16, -4
-; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; GFX942-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen
 ; GFX942-NEXT:    s_and_b32 s4, s16, 3
 ; GFX942-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX942-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX942-NEXT:    s_not_b32 s7, s4
 ; GFX942-NEXT:    s_mov_b64 s[4:5], 0
-; GFX942-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX942-NEXT:    v_max_f16_e32 v3, v0, v0
 ; GFX942-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v5, v2
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, s6, v5
-; GFX942-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX942-NEXT:    v_min_f16_e32 v2, v2, v0
-; GFX942-NEXT:    v_lshlrev_b32_e32 v2, s6, v2
-; GFX942-NEXT:    v_and_or_b32 v4, v5, s7, v2
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX942-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX942-NEXT:    v_min_f16_e32 v0, v0, v3
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX942-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT:    buffer_wbl2 sc1
-; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
+; GFX942-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_inv sc1
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX942-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3103,31 +3095,30 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT:    buffer_load_b32 v2, v3, s[0:3], 0 offen
 ; GFX11-TRUE16-NEXT:    s_not_b32 s6, s5
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-TRUE16-NEXT:    .p2align 6
 ; GFX11-TRUE16-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v2
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_min_f16_e32 v1.l, v0.h, v0.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v1, v2, s6, v1
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX11-TRUE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -3140,39 +3131,38 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s16, 0x200
-; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v3, v0, v0
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v2, s[0:3], 0 offen
 ; GFX11-FAKE16-NEXT:    s_not_b32 s6, s5
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-FAKE16-NEXT:    .p2align 6
 ; GFX11-FAKE16-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX11-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_min_f16_e32 v1, v1, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    buffer_gl1_inv
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -3185,32 +3175,32 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s20, 0x200
-; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX10-NEXT:    v_max_f16_e32 v3, v0, v0
 ; GFX10-NEXT:    s_and_b32 s4, s20, -4
-; GFX10-NEXT:    v_mov_b32_e32 v3, s4
+; GFX10-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-NEXT:    s_and_b32 s4, s20, 3
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX10-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX10-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX10-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX10-NEXT:    v_min_f16_e32 v1, v1, v0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_mov_b32_e32 v2, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX10-NEXT:    v_min_f16_e32 v0, v0, v3
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
@@ -3223,29 +3213,29 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s20, 0x200
 ; GFX90A-NEXT:    s_and_b32 s4, s20, -4
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s4
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX90A-NEXT:    s_and_b32 s4, s20, 3
 ; GFX90A-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX90A-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX90A-NEXT:    s_not_b32 s7, s4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX90A-NEXT:    v_max_f16_e32 v3, v0, v0
 ; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
-; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, s6, v5
-; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX90A-NEXT:    v_min_f16_e32 v2, v2, v0
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, s6, v2
-; GFX90A-NEXT:    v_and_or_b32 v4, v5, s7, v2
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX90A-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX90A-NEXT:    v_min_f16_e32 v0, v0, v3
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX90A-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3257,30 +3247,30 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s20, 0x200
 ; GFX908-NEXT:    s_and_b32 s4, s20, -4
-; GFX908-NEXT:    v_mov_b32_e32 v3, s4
-; GFX908-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX908-NEXT:    v_mov_b32_e32 v2, s4
+; GFX908-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX908-NEXT:    s_and_b32 s4, s20, 3
 ; GFX908-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX908-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX908-NEXT:    s_not_b32 s7, s4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX908-NEXT:    v_max_f16_e32 v3, v0, v0
 ; GFX908-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX908-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX908-NEXT:    v_min_f16_e32 v0, v0, v3
+; GFX908-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX908-NEXT:    v_and_or_b32 v0, v1, s7, v0
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v1
-; GFX908-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX908-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX908-NEXT:    v_min_f16_e32 v1, v1, v0
-; GFX908-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX908-NEXT:    v_and_or_b32 v4, v5, s7, v1
-; GFX908-NEXT:    v_mov_b32_e32 v1, v4
-; GFX908-NEXT:    v_mov_b32_e32 v2, v5
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v4, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3292,31 +3282,31 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s20, 0x200
 ; GFX8-NEXT:    s_and_b32 s4, s20, -4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
-; GFX8-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX8-NEXT:    s_and_b32 s4, s20, 3
 ; GFX8-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX8-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_e32 v3, v0, v0
 ; GFX8-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_min_f16_e32 v0, v0, v3
+; GFX8-NEXT:    v_and_b32_e32 v4, s7, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX8-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT:    v_min_f16_e32 v1, v1, v0
-; GFX8-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX8-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v2, v5
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_mov_b32_e32 v4, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3328,33 +3318,33 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s20, 0x200
 ; GFX7-NEXT:    s_and_b32 s4, s20, -4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s4
-; GFX7-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-NEXT:    s_and_b32 s4, s20, 3
 ; GFX7-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s6
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v4, s7, v1
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX7-NEXT:    v_min_f32_e32 v1, v1, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX7-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX7-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7-NEXT:    v_mov_b32_e32 v2, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v4, v0
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3366,34 +3356,34 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s20, 0x200
 ; GFX6-NEXT:    s_and_b32 s4, s20, -4
-; GFX6-NEXT:    v_mov_b32_e32 v3, s4
-; GFX6-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    s_and_b32 s4, s20, 3
 ; GFX6-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX6-NEXT:    s_lshl_b32 s4, 0xffff, s6
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX6-NEXT:    v_min_f32_e32 v1, v1, v0
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX6-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX6-NEXT:    v_mov_b32_e32 v1, v4
-; GFX6-NEXT:    v_mov_b32_e32 v2, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX6-NEXT:    v_and_b32_e32 v4, s7, v1
+; GFX6-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX6-NEXT:    v_mov_b32_e32 v5, v1
+; GFX6-NEXT:    v_mov_b32_e32 v4, v0
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3446,21 +3436,18 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX12-TRUE16-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v8, v6
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, v9, v6
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, v9, v8
 ; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v4.h, v5.l, v5.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_min_num_f16_e32 v5.l, v4.h, v4.l
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, v9, v5
-; GFX12-TRUE16-NEXT:    v_and_or_b32 v7, v8, v11, v5
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v7
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v7, v8
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_and_or_b32 v5, v6, v11, v5
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5
 ; GFX12-TRUE16-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX12-TRUE16-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX12-TRUE16-NEXT:    v_readfirstlane_b32 s4, v0
@@ -3475,13 +3462,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[6:7], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
 ; GFX12-TRUE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB12_4
 ; GFX12-TRUE16-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX12-TRUE16-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v6, v8
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v7, v6
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v7
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-TRUE16-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
@@ -3489,7 +3477,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX12-TRUE16-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX12-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, v9, v6
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, v9, v7
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3522,7 +3510,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    buffer_load_b32 v4, v8, s[4:7], null offen
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v6, v8, s[4:7], null offen
 ; GFX12-FAKE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX12-FAKE16-NEXT:  ; %bb.2:
@@ -3533,19 +3521,17 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX12-FAKE16-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v4
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v4, v4, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_min_num_f16_e32 v4, v4, v10
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_and_or_b32 v5, v6, v9, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v5, v6
 ; GFX12-FAKE16-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
@@ -3569,6 +3555,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX12-FAKE16-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-FAKE16-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
@@ -3601,7 +3588,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX942-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
 ; GFX942-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT:    buffer_load_dword v4, v9, s[4:7], 0 offen
+; GFX942-NEXT:    buffer_load_dword v7, v9, s[4:7], 0 offen
 ; GFX942-NEXT:    s_xor_b64 exec, exec, s[0:1]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX942-NEXT:  ; %bb.2:
@@ -3612,7 +3599,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX942-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX942-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX942-NEXT:    v_lshrrev_b32_e32 v4, v8, v7
 ; GFX942-NEXT:    v_max_f16_e32 v4, v4, v4
 ; GFX942-NEXT:    v_min_f16_e32 v4, v4, v11
@@ -3641,6 +3627,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
 ; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX942-NEXT:    buffer_inv sc1
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB12_3
@@ -3684,21 +3671,18 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX11-TRUE16-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, v9, v6
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, v9, v8
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v4.h, v5.l, v5.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_min_f16_e32 v5.l, v4.h, v4.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, v9, v5
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v7, v8, v11, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v7
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v7, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v5, v6, v11, v5
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5
 ; GFX11-TRUE16-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX11-TRUE16-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v0
@@ -3712,13 +3696,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[6:7], v10, s[4:7], 0 offen glc
+; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], 0 offen glc
 ; GFX11-TRUE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB12_4
 ; GFX11-TRUE16-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v6, v8
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v7, v6
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v7
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
 ; GFX11-TRUE16-NEXT:    s_or_b32 s1, vcc_lo, s1
@@ -3727,7 +3712,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX11-TRUE16-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, v9, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, v9, v7
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -3754,7 +3739,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX11-FAKE16-NEXT:    buffer_load_b32 v4, v8, s[4:7], 0 offen
+; GFX11-FAKE16-NEXT:    buffer_load_b32 v6, v8, s[4:7], 0 offen
 ; GFX11-FAKE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX11-FAKE16-NEXT:  ; %bb.2:
@@ -3765,19 +3750,17 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX11-FAKE16-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX11-FAKE16-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_min_f16_e32 v4, v4, v10
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v5, v6, v9, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v6
 ; GFX11-FAKE16-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
@@ -3800,6 +3783,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX11-FAKE16-NEXT:    buffer_gl1_inv
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, vcc_lo, s1
@@ -3831,7 +3815,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
 ; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
-; GFX10-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX10-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB12_1
@@ -3842,10 +3826,9 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX10-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX10-NEXT:    v_max_f16_e32 v4, v4, v4
 ; GFX10-NEXT:    v_min_f16_e32 v4, v4, v10
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
@@ -3871,6 +3854,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
@@ -3903,7 +3887,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX90A-NEXT:    s_nop 0
-; GFX90A-NEXT:    buffer_load_dword v4, v9, s[8:11], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v7, v9, s[8:11], 0 offen
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX90A-NEXT:  ; %bb.2:
@@ -3914,7 +3898,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX90A-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v8, v7
 ; GFX90A-NEXT:    v_max_f16_e32 v4, v4, v4
 ; GFX90A-NEXT:    v_min_f16_e32 v4, v4, v11
@@ -3941,6 +3924,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB12_3
@@ -3970,7 +3954,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX908-NEXT:    s_nop 0
-; GFX908-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX908-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX908-NEXT:  ; %bb.2:
@@ -3981,7 +3965,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX908-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX908-NEXT:    v_max_f16_e32 v4, v4, v4
 ; GFX908-NEXT:    v_min_f16_e32 v4, v4, v10
@@ -4009,6 +3992,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB12_3
@@ -4038,7 +4022,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX8-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX8-NEXT:  ; %bb.2:
@@ -4049,7 +4033,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX8-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX8-NEXT:    v_max_f16_e32 v4, v4, v4
 ; GFX8-NEXT:    v_min_f16_e32 v4, v4, v10
@@ -4078,6 +4061,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB12_3
@@ -4105,19 +4089,18 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
 ; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX7-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX7-NEXT:  ; %bb.2:
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v5
 ; GFX7-NEXT:    s_mov_b64 s[6:7], 0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v4
 ; GFX7-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX7-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GFX7-NEXT:    v_and_b32_e32 v5, v6, v9
@@ -4147,6 +4130,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
 ; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB12_3
@@ -4175,20 +4159,18 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
 ; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX6-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX6-NEXT:  ; %bb.2:
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v5
 ; GFX6-NEXT:    s_mov_b64 s[6:7], 0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v10, v5
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v10, v4
 ; GFX6-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX6-NEXT:    ; Child Loop BB12_4 Depth 2
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v6, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GFX6-NEXT:    v_and_b32_e32 v5, v6, v9
@@ -4218,6 +4200,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
 ; GFX6-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB12_3
@@ -4245,55 +4228,55 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
 ; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_addk_co_i32 s16, 0x200
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX12-TRUE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], null offen
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, v4, s[0:3], null offen
 ; GFX12-TRUE16-NEXT:    s_not_b32 s6, s5
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-TRUE16-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v1, v1, v0
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v0, v0, v5
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-TRUE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v1.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v2
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX12-TRUE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
 ; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX12-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4305,54 +4288,54 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_addk_co_i32 s16, 0x200
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX12-FAKE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], null offen
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, v4, s[0:3], null offen
 ; GFX12-FAKE16-NEXT:    s_not_b32 s6, s5
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-FAKE16-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v1, v1, v0
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v0, v0, v5
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX12-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
 ; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX12-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4361,36 +4344,36 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    s_addk_i32 s16, 0x200
 ; GFX942-NEXT:    s_and_b32 s4, s16, -4
-; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; GFX942-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen
 ; GFX942-NEXT:    s_and_b32 s4, s16, 3
 ; GFX942-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX942-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX942-NEXT:    s_not_b32 s7, s4
 ; GFX942-NEXT:    s_mov_b64 s[4:5], 0
-; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX942-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX942-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v5, v2
-; GFX942-NEXT:    v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX942-NEXT:    buffer_wbl2 sc1
-; GFX942-NEXT:    v_min_f32_e32 v2, v2, v0
-; GFX942-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX942-NEXT:    v_or_b32_e32 v4, 0x400000, v2
-; GFX942-NEXT:    v_add3_u32 v3, v3, v2, s8
-; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX942-NEXT:    v_min_f32_e32 v0, v0, v5
+; GFX942-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX942-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX942-NEXT:    v_add3_u32 v2, v2, v0, s8
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX942-NEXT:    v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT:    v_and_or_b32 v4, v5, s7, v2
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
-; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX942-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_inv sc1
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX942-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4402,147 +4385,147 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_addk_i32 s16, 0x200
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, v4, s[0:3], 0 offen
 ; GFX11-TRUE16-NEXT:    s_not_b32 s6, s5
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-TRUE16-NEXT:    .p2align 6
 ; GFX11-TRUE16-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_min_f32_e32 v1, v1, v0
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_min_f32_e32 v0, v0, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v1.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v2
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX11-TRUE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX11-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s16, 0x200
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v4, s[0:3], 0 offen
 ; GFX11-FAKE16-NEXT:    s_not_b32 s6, s5
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-FAKE16-NEXT:    .p2align 6
 ; GFX11-FAKE16-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_min_f32_e32 v1, v1, v0
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    v_min_f32_e32 v0, v0, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    buffer_gl1_inv
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
 ; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX11-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s20, 0x200
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX10-NEXT:    s_and_b32 s4, s20, -4
-; GFX10-NEXT:    v_mov_b32_e32 v3, s4
+; GFX10-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX10-NEXT:    s_and_b32 s4, s20, 3
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX10-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX10-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, s4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_min_f32_e32 v1, v1, v0
-; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_mov_b32_e32 v2, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_min_f32_e32 v0, v0, v5
+; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -4550,34 +4533,34 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s20, 0x200
 ; GFX90A-NEXT:    s_and_b32 s4, s20, -4
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s4
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
+; GFX90A-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX90A-NEXT:    s_and_b32 s4, s20, 3
 ; GFX90A-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX90A-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX90A-NEXT:    s_not_b32 s7, s4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
-; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v0
-; GFX90A-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX90A-NEXT:    v_or_b32_e32 v4, 0x400000, v2
-; GFX90A-NEXT:    v_add3_u32 v3, v3, v2, s8
-; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX90A-NEXT:    v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT:    v_and_or_b32 v4, v5, s7, v2
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT:    v_min_f32_e32 v0, v0, v5
+; GFX90A-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX90A-NEXT:    v_add3_u32 v2, v2, v0, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX90A-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4590,40 +4573,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s20, 0x200
 ; GFX908-NEXT:    s_and_b32 s4, s20, -4
-; GFX908-NEXT:    v_mov_b32_e32 v3, s4
-; GFX908-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX908-NEXT:    v_mov_b32_e32 v4, s4
+; GFX908-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX908-NEXT:    s_and_b32 s4, s20, 3
 ; GFX908-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX908-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX908-NEXT:    s_not_b32 s7, s4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v5, v1
-; GFX908-NEXT:    v_lshrrev_b32_sdwa v1, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT:    v_min_f32_e32 v1, v1, v0
-; GFX908-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX908-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX908-NEXT:    v_add3_u32 v2, v2, v1, s8
-; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX908-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
-; GFX908-NEXT:    v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT:    v_and_or_b32 v4, v5, s7, v1
-; GFX908-NEXT:    v_mov_b32_e32 v1, v4
-; GFX908-NEXT:    v_mov_b32_e32 v2, v5
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT:    v_min_f32_e32 v0, v0, v5
+; GFX908-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX908-NEXT:    v_add3_u32 v2, v2, v0, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX908-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX908-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX908-NEXT:    v_mov_b32_e32 v3, v1
+; GFX908-NEXT:    v_mov_b32_e32 v2, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -4631,42 +4614,42 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s20, 0x200
 ; GFX8-NEXT:    s_and_b32 s4, s20, -4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
-; GFX8-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX8-NEXT:    s_and_b32 s4, s20, 3
 ; GFX8-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX8-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_lshrrev_b32_sdwa v1, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_min_f32_e32 v1, v1, v0
-; GFX8-NEXT:    v_bfe_u32 v6, v1, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v1
+; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_min_f32_e32 v3, v3, v5
+; GFX8-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v3
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v1
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
-; GFX8-NEXT:    v_and_b32_e32 v4, s7, v5
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_or_b32_e32 v4, v4, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v2, v5
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v3
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc
+; GFX8-NEXT:    v_and_b32_e32 v2, s7, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -4674,39 +4657,39 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s20, 0x200
 ; GFX7-NEXT:    s_and_b32 s4, s20, -4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s4
-; GFX7-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v4, s4
+; GFX7-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX7-NEXT:    s_and_b32 s4, s20, 3
 ; GFX7-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX7-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_min_f32_e32 v1, v1, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX7-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX7-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7-NEXT:    v_mov_b32_e32 v2, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, s7, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4715,40 +4698,40 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s20, 0x200
 ; GFX6-NEXT:    s_and_b32 s4, s20, -4
-; GFX6-NEXT:    v_mov_b32_e32 v3, s4
-; GFX6-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v4, s4
+; GFX6-NEXT:    buffer_load_dword v1, v4, s[16:19], 0 offen
 ; GFX6-NEXT:    s_and_b32 s4, s20, 3
 ; GFX6-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX6-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX6-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT:    v_min_f32_e32 v0, v0, v5
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX6-NEXT:    v_min_f32_e32 v1, v1, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX6-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX6-NEXT:    v_mov_b32_e32 v1, v4
-; GFX6-NEXT:    v_mov_b32_e32 v2, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX6-NEXT:    v_and_b32_e32 v2, s7, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT:    v_mov_b32_e32 v3, v1
+; GFX6-NEXT:    v_mov_b32_e32 v2, v0
+; GFX6-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -4766,48 +4749,48 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
 ; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_addk_co_i32 s16, 0x200
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX12-TRUE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], null offen
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, v2, s[0:3], null offen
 ; GFX12-TRUE16-NEXT:    s_not_b32 s6, s5
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-TRUE16-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v1, v1, v0
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v0, v0, v3
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-TRUE16-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v1.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.h, 0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v0.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX12-TRUE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -4825,47 +4808,47 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_addk_co_i32 s16, 0x200
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX12-FAKE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], null offen
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, v2, s[0:3], null offen
 ; GFX12-FAKE16-NEXT:    s_not_b32 s6, s5
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-FAKE16-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v1, v1, v0
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v0, v0, v3
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-FAKE16-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX12-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -4880,36 +4863,36 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    s_addk_i32 s16, 0x200
 ; GFX942-NEXT:    s_and_b32 s4, s16, -4
-; GFX942-NEXT:    v_mov_b32_e32 v1, s4
-; GFX942-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; GFX942-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen
 ; GFX942-NEXT:    s_and_b32 s4, s16, 3
 ; GFX942-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX942-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX942-NEXT:    s_not_b32 s7, s4
 ; GFX942-NEXT:    s_mov_b64 s[4:5], 0
-; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX942-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX942-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v5, v2
-; GFX942-NEXT:    v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX942-NEXT:    buffer_wbl2 sc1
-; GFX942-NEXT:    v_min_f32_e32 v2, v2, v0
-; GFX942-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX942-NEXT:    v_or_b32_e32 v4, 0x400000, v2
-; GFX942-NEXT:    v_add3_u32 v3, v3, v2, s8
-; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX942-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX942-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX942-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX942-NEXT:    v_add3_u32 v4, v4, v0, s8
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX942-NEXT:    v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT:    v_and_or_b32 v4, v5, s7, v2
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
-; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX942-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_inv sc1
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX942-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4920,46 +4903,46 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_addk_i32 s16, 0x200
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX11-TRUE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, v2, s[0:3], 0 offen
 ; GFX11-TRUE16-NEXT:    s_not_b32 s6, s5
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-TRUE16-NEXT:    .p2align 6
 ; GFX11-TRUE16-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_min_f32_e32 v1, v1, v0
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v1.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v0.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v4
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX11-TRUE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -4972,45 +4955,45 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    s_addk_i32 s16, 0x200
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s16, -4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, s4
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX11-FAKE16-NEXT:    s_and_b32 s4, s16, 3
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v3, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v2, s[0:3], 0 offen
 ; GFX11-FAKE16-NEXT:    s_not_b32 s6, s5
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-FAKE16-NEXT:    .p2align 6
 ; GFX11-FAKE16-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, s4, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_min_f32_e32 v1, v1, v0
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    buffer_gl1_inv
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -5023,36 +5006,36 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s20, 0x200
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX10-NEXT:    s_and_b32 s4, s20, -4
-; GFX10-NEXT:    v_mov_b32_e32 v3, s4
+; GFX10-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-NEXT:    s_and_b32 s4, s20, 3
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 3
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
-; GFX10-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX10-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, s4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_min_f32_e32 v1, v1, v0
-; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT:    v_and_or_b32 v4, v5, s6, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_mov_b32_e32 v2, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX10-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
@@ -5065,34 +5048,34 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s20, 0x200
 ; GFX90A-NEXT:    s_and_b32 s4, s20, -4
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s4
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX90A-NEXT:    s_and_b32 s4, s20, 3
 ; GFX90A-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX90A-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX90A-NEXT:    s_not_b32 s7, s4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
-; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v0
-; GFX90A-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX90A-NEXT:    v_or_b32_e32 v4, 0x400000, v2
-; GFX90A-NEXT:    v_add3_u32 v3, v3, v2, s8
-; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX90A-NEXT:    v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT:    v_and_or_b32 v4, v5, s7, v2
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX90A-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX90A-NEXT:    v_add3_u32 v4, v4, v0, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX90A-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5104,35 +5087,35 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s20, 0x200
 ; GFX908-NEXT:    s_and_b32 s4, s20, -4
-; GFX908-NEXT:    v_mov_b32_e32 v3, s4
-; GFX908-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX908-NEXT:    v_mov_b32_e32 v2, s4
+; GFX908-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX908-NEXT:    s_and_b32 s4, s20, 3
 ; GFX908-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX908-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX908-NEXT:    s_not_b32 s7, s4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX908-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX908-NEXT:    v_add3_u32 v4, v4, v0, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX908-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX908-NEXT:    v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT:    v_and_or_b32 v0, v1, s7, v0
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v1
-; GFX908-NEXT:    v_lshrrev_b32_sdwa v1, s6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT:    v_min_f32_e32 v1, v1, v0
-; GFX908-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX908-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX908-NEXT:    v_add3_u32 v2, v2, v1, s8
-; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX908-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
-; GFX908-NEXT:    v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT:    v_and_or_b32 v4, v5, s7, v1
-; GFX908-NEXT:    v_mov_b32_e32 v1, v4
-; GFX908-NEXT:    v_mov_b32_e32 v2, v5
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v4, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5144,37 +5127,37 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s20, 0x200
 ; GFX8-NEXT:    s_and_b32 s4, s20, -4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
-; GFX8-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX8-NEXT:    s_and_b32 s4, s20, 3
 ; GFX8-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX8-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_lshrrev_b32_sdwa v1, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_min_f32_e32 v1, v1, v0
-; GFX8-NEXT:    v_bfe_u32 v6, v1, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v1
+; GFX8-NEXT:    v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_min_f32_e32 v5, v5, v3
+; GFX8-NEXT:    v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v5
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v1
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
-; GFX8-NEXT:    v_and_b32_e32 v4, s7, v5
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_or_b32_e32 v4, v4, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v2, v5
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v5
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX8-NEXT:    v_and_b32_e32 v4, s7, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT:    v_mov_b32_e32 v5, v1
+; GFX8-NEXT:    v_mov_b32_e32 v4, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5186,34 +5169,34 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s20, 0x200
 ; GFX7-NEXT:    s_and_b32 s4, s20, -4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s4
-; GFX7-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX7-NEXT:    s_and_b32 s4, s20, 3
 ; GFX7-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX7-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v4, s7, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_min_f32_e32 v1, v1, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX7-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX7-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7-NEXT:    v_mov_b32_e32 v2, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v4, v0
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5225,35 +5208,35 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s20, 0x200
 ; GFX6-NEXT:    s_and_b32 s4, s20, -4
-; GFX6-NEXT:    v_mov_b32_e32 v3, s4
-; GFX6-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
 ; GFX6-NEXT:    s_and_b32 s4, s20, 3
 ; GFX6-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX6-NEXT:    s_lshl_b32 s4, 0xffff, s6
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX6-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s6, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX6-NEXT:    v_min_f32_e32 v1, v1, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, s7, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s6, v1
-; GFX6-NEXT:    v_or_b32_e32 v4, v2, v1
-; GFX6-NEXT:    v_mov_b32_e32 v1, v4
-; GFX6-NEXT:    v_mov_b32_e32 v2, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX6-NEXT:    v_and_b32_e32 v4, s7, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s6, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX6-NEXT:    v_mov_b32_e32 v5, v1
+; GFX6-NEXT:    v_mov_b32_e32 v4, v0
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5295,7 +5278,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_b32 v4, v8, s[4:7], null offen
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v6, v8, s[4:7], null offen
 ; GFX12-TRUE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX12-TRUE16-NEXT:  ; %bb.2:
@@ -5306,27 +5289,26 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX12-TRUE16-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v4
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v4, v4, v10
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-TRUE16-NEXT:    v_bfe_u32 v5, v4, 16, 1
 ; GFX12-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v4
 ; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc_lo
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v4.h
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v5
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v5
 ; GFX12-TRUE16-NEXT:    v_and_or_b32 v5, v6, v9, v4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, v6
 ; GFX12-TRUE16-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
@@ -5350,6 +5332,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX12-TRUE16-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-TRUE16-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
@@ -5390,7 +5373,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    buffer_load_b32 v4, v8, s[4:7], null offen
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v6, v8, s[4:7], null offen
 ; GFX12-FAKE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX12-FAKE16-NEXT:  ; %bb.2:
@@ -5401,26 +5384,25 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX12-FAKE16-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v4
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v4, v4, v10
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-FAKE16-NEXT:    v_bfe_u32 v5, v4, 16, 1
 ; GFX12-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v4
 ; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc_lo
 ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
 ; GFX12-FAKE16-NEXT:    v_and_or_b32 v5, v6, v9, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v5, v6
 ; GFX12-FAKE16-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
@@ -5444,6 +5426,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX12-FAKE16-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-FAKE16-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
@@ -5476,7 +5459,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX942-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
 ; GFX942-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT:    buffer_load_dword v4, v9, s[4:7], 0 offen
+; GFX942-NEXT:    buffer_load_dword v7, v9, s[4:7], 0 offen
 ; GFX942-NEXT:    s_xor_b64 exec, exec, s[0:1]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX942-NEXT:  ; %bb.2:
@@ -5488,7 +5471,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX942-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX942-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX942-NEXT:    v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX942-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX942-NEXT:    v_min_f32_e32 v4, v4, v11
@@ -5522,6 +5504,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
 ; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX942-NEXT:    buffer_inv sc1
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB15_3
@@ -5554,7 +5537,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX11-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT:    buffer_load_b32 v4, v8, s[4:7], 0 offen
+; GFX11-TRUE16-NEXT:    buffer_load_b32 v6, v8, s[4:7], 0 offen
 ; GFX11-TRUE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX11-TRUE16-NEXT:  ; %bb.2:
@@ -5566,26 +5549,25 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX11-TRUE16-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_min_f32_e32 v4, v4, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v5, v4, 16, 1
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v4
 ; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v4.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v5
 ; GFX11-TRUE16-NEXT:    v_and_or_b32 v5, v6, v9, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v6
 ; GFX11-TRUE16-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
@@ -5608,6 +5590,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
 ; GFX11-TRUE16-NEXT:    s_or_b32 s1, vcc_lo, s1
@@ -5644,7 +5627,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX11-FAKE16-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX11-FAKE16-NEXT:    buffer_load_b32 v4, v8, s[4:7], 0 offen
+; GFX11-FAKE16-NEXT:    buffer_load_b32 v6, v8, s[4:7], 0 offen
 ; GFX11-FAKE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX11-FAKE16-NEXT:  ; %bb.2:
@@ -5656,25 +5639,24 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX11-FAKE16-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_min_f32_e32 v4, v4, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v4, 16, 1
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v4
 ; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v5, v11, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, v7, v4
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v5, v6, v9, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v6
 ; GFX11-FAKE16-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
@@ -5697,6 +5679,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX11-FAKE16-NEXT:    buffer_gl1_inv
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
 ; GFX11-FAKE16-NEXT:    s_or_b32 s1, vcc_lo, s1
@@ -5729,7 +5712,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
 ; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
-; GFX10-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX10-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
@@ -5740,10 +5723,9 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX10-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, v4
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_min_f32_e32 v4, v4, v10
 ; GFX10-NEXT:    v_bfe_u32 v5, v4, 16, 1
 ; GFX10-NEXT:    v_or_b32_e32 v11, 0x400000, v4
@@ -5773,6 +5755,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
@@ -5805,7 +5788,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX90A-NEXT:    s_nop 0
-; GFX90A-NEXT:    buffer_load_dword v4, v9, s[8:11], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v7, v9, s[8:11], 0 offen
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX90A-NEXT:  ; %bb.2:
@@ -5817,7 +5800,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX90A-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX90A-NEXT:    v_min_f32_e32 v4, v4, v11
 ; GFX90A-NEXT:    v_bfe_u32 v5, v4, 16, 1
@@ -5848,6 +5830,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB15_3
@@ -5877,7 +5860,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX908-NEXT:    s_nop 0
-; GFX908-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX908-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX908-NEXT:  ; %bb.2:
@@ -5889,7 +5872,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX908-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX908-NEXT:    v_min_f32_e32 v4, v4, v10
 ; GFX908-NEXT:    v_bfe_u32 v5, v4, 16, 1
@@ -5921,6 +5903,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB15_3
@@ -5950,7 +5933,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX8-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX8-NEXT:  ; %bb.2:
@@ -5961,7 +5944,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX8-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_min_f32_e32 v4, v4, v10
 ; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
@@ -5995,6 +5977,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB15_3
@@ -6022,19 +6005,18 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
 ; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX7-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX7-NEXT:  ; %bb.2:
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v5
 ; GFX7-NEXT:    s_mov_b64 s[6:7], 0
-; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
 ; GFX7-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX7-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -6065,6 +6047,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
 ; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB15_3
@@ -6093,20 +6076,18 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
 ; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT:    buffer_load_dword v4, v8, s[8:11], 0 offen
+; GFX6-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX6-NEXT:  ; %bb.2:
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v5
 ; GFX6-NEXT:    s_mov_b64 s[6:7], 0
-; GFX6-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
+; GFX6-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
 ; GFX6-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX6-NEXT:    ; Child Loop BB15_4 Depth 2
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v6, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -6137,6 +6118,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
 ; GFX6-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB15_3
@@ -6486,7 +6468,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, s16
 ; GFX12-NEXT:    s_add_co_i32 s4, s16, 0x400
-; GFX12-NEXT:    v_pk_max_num_f16 v0, v0, v0
+; GFX12-NEXT:    v_pk_max_num_f16 v2, v0, v0
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX12-NEXT:    s_mov_b32 s4, 0
@@ -6494,17 +6476,16 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
 ; GFX12-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-NEXT:    v_pk_max_num_f16 v0, v1, v1
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_pk_max_num_f16 v1, v5, v5
-; GFX12-NEXT:    v_pk_min_num_f16 v4, v1, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    v_pk_min_num_f16 v0, v0, v2
+; GFX12-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -6518,25 +6499,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_mov_b32_e32 v1, s16
-; GFX942-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
 ; GFX942-NEXT:    s_add_i32 s6, s16, 0x400
 ; GFX942-NEXT:    s_mov_b64 s[4:5], 0
-; GFX942-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX942-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-NEXT:    v_pk_max_f16 v2, v0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX942-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v5, v2
-; GFX942-NEXT:    v_pk_max_f16 v2, v5, v5
+; GFX942-NEXT:    v_pk_max_f16 v0, v1, v1
 ; GFX942-NEXT:    buffer_wbl2 sc1
-; GFX942-NEXT:    v_pk_min_f16 v4, v2, v0
+; GFX942-NEXT:    v_pk_min_f16 v0, v0, v2
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
-; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[0:3], 0 offen sc0
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_inv sc1
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX942-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6548,25 +6529,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s16
 ; GFX11-NEXT:    s_add_i32 s4, s16, 0x400
-; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-NEXT:    v_pk_max_f16 v2, v0, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
 ; GFX11-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-NEXT:    v_pk_max_f16 v0, v1, v1
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_pk_max_f16 v1, v5, v5
-; GFX11-NEXT:    v_pk_min_f16 v4, v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v3, s[0:3], 0 offen glc
+; GFX11-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX11-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -6580,24 +6560,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s20
 ; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
-; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT:    v_pk_max_f16 v2, v0, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX10-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_pk_max_f16 v0, v1, v1
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_pk_max_f16 v1, v5, v5
-; GFX10-NEXT:    v_pk_min_f16 v4, v1, v0
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_mov_b32_e32 v2, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB17_1
@@ -6609,23 +6589,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX90A-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NEXT:    v_pk_max_f16 v2, v0, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX90A-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
-; GFX90A-NEXT:    v_pk_max_f16 v2, v5, v5
-; GFX90A-NEXT:    v_pk_min_f16 v4, v2, v0
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_pk_max_f16 v0, v1, v1
+; GFX90A-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6639,21 +6619,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
 ; GFX908-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX908-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX908-NEXT:    v_pk_max_f16 v2, v0, v0
 ; GFX908-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX908-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_pk_max_f16 v0, v1, v1
+; GFX908-NEXT:    v_pk_min_f16 v0, v0, v2
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v1
-; GFX908-NEXT:    v_pk_max_f16 v1, v5, v5
-; GFX908-NEXT:    v_pk_min_f16 v4, v1, v0
-; GFX908-NEXT:    v_mov_b32_e32 v1, v4
-; GFX908-NEXT:    v_mov_b32_e32 v2, v5
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v3, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v4, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6667,25 +6647,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
 ; GFX8-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX8-NEXT:    s_add_i32 s6, s20, 0x400
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v3, v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s6
 ; GFX8-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v5, v1, v1
+; GFX8-NEXT:    v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_min_f16_e32 v5, v5, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v2, v6, v6
-; GFX8-NEXT:    v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_min_f16_e32 v2, v2, v0
-; GFX8-NEXT:    v_or_b32_e32 v5, v2, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, v5
-; GFX8-NEXT:    v_mov_b32_e32 v2, v6
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_mov_b32_e32 v5, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7951,45 +7931,43 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, s16
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
 ; GFX12-TRUE16-NEXT:    s_add_co_i32 s4, s16, 0x400
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
 ; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s4, 0
 ; GFX12-TRUE16-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, v1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX12-TRUE16-NEXT:    v_dual_min_num_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v1, 16, v6
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_min_num_f32_e32 v1, v1, v0
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT:    v_dual_min_num_f32 v5, v5, v2 :: v_dual_min_num_f32 v0, v0, v3
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v7, v5, 16, 1
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-TRUE16-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v5, v8, vcc_lo
-; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v1, v2
-; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
-; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v5
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX12-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v6
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX12-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -8006,44 +7984,42 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, s16
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
 ; GFX12-FAKE16-NEXT:    s_add_co_i32 s4, s16, 0x400
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
 ; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX12-FAKE16-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, v1
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX12-FAKE16-NEXT:    s_wait_storecnt 0x0
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX12-FAKE16-NEXT:    v_dual_min_num_f32 v2, v2, v0 :: v_dual_lshlrev_b32 v1, 16, v6
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_min_num_f32_e32 v1, v1, v3
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT:    v_dual_min_num_f32 v5, v5, v3 :: v_dual_min_num_f32 v0, v0, v2
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v7, v5, 16, 1
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-FAKE16-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
-; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v1, v1
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
-; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v5, v8, s4
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
 ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v2, v1, 0x7060302
-; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
-; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX12-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-FAKE16-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v6
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX12-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
 ; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -8057,40 +8033,40 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_mov_b32_e32 v1, s16
-; GFX942-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:1024
+; GFX942-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
 ; GFX942-NEXT:    s_add_i32 s4, s16, 0x400
 ; GFX942-NEXT:    s_mov_b64 s[6:7], 0
-; GFX942-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX942-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX942-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX942-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX942-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX942-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX942-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v7, v2
-; GFX942-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX942-NEXT:    v_and_b32_e32 v3, 0xffff0000, v7
-; GFX942-NEXT:    v_min_f32_e32 v2, v2, v1
-; GFX942-NEXT:    v_min_f32_e32 v3, v3, v0
-; GFX942-NEXT:    v_bfe_u32 v5, v2, 16, 1
-; GFX942-NEXT:    v_bfe_u32 v8, v3, 16, 1
-; GFX942-NEXT:    v_or_b32_e32 v6, 0x400000, v2
-; GFX942-NEXT:    v_or_b32_e32 v9, 0x400000, v3
-; GFX942-NEXT:    v_add3_u32 v5, v5, v2, s8
-; GFX942-NEXT:    v_add3_u32 v8, v8, v3, s8
-; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX942-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX942-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX942-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX942-NEXT:    v_min_f32_e32 v5, v5, v3
+; GFX942-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX942-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX942-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX942-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX942-NEXT:    v_add3_u32 v6, v6, v0, s8
+; GFX942-NEXT:    v_add3_u32 v8, v8, v5, s8
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
 ; GFX942-NEXT:    buffer_wbl2 sc1
-; GFX942-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX942-NEXT:    v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX942-NEXT:    v_perm_b32 v6, v3, v2, s9
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
-; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX942-NEXT:    v_perm_b32 v0, v5, v0, s9
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
+; GFX942-NEXT:    buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    buffer_inv sc1
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v7
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v1
 ; GFX942-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX942-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX942-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8100,11 +8076,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, s16
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
 ; GFX11-TRUE16-NEXT:    s_add_i32 s4, s16, 0x400
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
 ; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
 ; GFX11-TRUE16-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -8112,35 +8087,34 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX11-TRUE16-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX11-TRUE16-NEXT:    v_dual_min_f32 v2, v2, v3 :: v_dual_lshlrev_b32 v1, 16, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_min_f32_e32 v1, v1, v0
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_dual_min_f32 v5, v5, v2 :: v_dual_min_f32 v0, v0, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v5, 16, 1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v5, v8, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v1, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
-; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v6
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX11-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -8153,11 +8127,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, s16
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
 ; GFX11-FAKE16-NEXT:    s_add_i32 s4, s16, 0x400
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
 ; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s5, 0
 ; GFX11-FAKE16-NEXT:    s_set_inst_prefetch_distance 0x1
@@ -8165,33 +8138,32 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX11-FAKE16-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX11-FAKE16-NEXT:    v_dual_min_f32 v2, v2, v0 :: v_dual_lshlrev_b32 v1, 16, v6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_min_f32 v5, v5, v3 :: v_dual_min_f32 v0, v0, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v5, 16, 1
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-FAKE16-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v1, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v5, v8, s4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v2, v1, 0x7060302
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
-; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[1:2], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    buffer_gl1_inv
 ; GFX11-FAKE16-NEXT:    buffer_gl0_inv
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v6
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX11-FAKE16-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
@@ -8206,38 +8178,38 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s20
 ; GFX10-NEXT:    s_add_i32 s4, s20, 0x400
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX10-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX10-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_min_f32_e32 v2, v2, v0
-; GFX10-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
-; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT:    v_add3_u32 v5, v5, v1, 0x7fff
-; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v1, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v8, s4
-; GFX10-NEXT:    v_perm_b32 v5, v2, v1, 0x7060302
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
-; GFX10-NEXT:    v_mov_b32_e32 v2, v6
-; GFX10-NEXT:    buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
+; GFX10-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_min_f32_e32 v5, v5, v3
+; GFX10-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX10-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v6, v1
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v6
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB20_1
@@ -8249,39 +8221,39 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s20
-; GFX90A-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
+; GFX90A-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX90A-NEXT:    s_add_i32 s4, s20, 0x400
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX90A-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v7, v2
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v7
-; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v1
-; GFX90A-NEXT:    v_min_f32_e32 v3, v3, v0
-; GFX90A-NEXT:    v_bfe_u32 v5, v2, 16, 1
-; GFX90A-NEXT:    v_bfe_u32 v8, v3, 16, 1
-; GFX90A-NEXT:    v_or_b32_e32 v6, 0x400000, v2
-; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v3
-; GFX90A-NEXT:    v_add3_u32 v5, v5, v2, s8
-; GFX90A-NEXT:    v_add3_u32 v8, v8, v3, s8
-; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX90A-NEXT:    v_perm_b32 v6, v3, v2, s9
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX90A-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX90A-NEXT:    v_min_f32_e32 v5, v5, v3
+; GFX90A-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT:    v_add3_u32 v6, v6, v0, s8
+; GFX90A-NEXT:    v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT:    v_perm_b32 v0, v5, v0, s9
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v7
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v1
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8295,37 +8267,37 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX908-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX908-NEXT:    s_add_i32 s4, s20, 0x400
 ; GFX908-NEXT:    s_mov_b64 s[6:7], 0
-; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX908-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX908-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX908-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX908-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX908-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX908-NEXT:    v_min_f32_e32 v5, v5, v3
+; GFX908-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT:    v_add3_u32 v6, v6, v0, s8
+; GFX908-NEXT:    v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT:    v_perm_b32 v0, v5, v0, s9
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v1
-; GFX908-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX908-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX908-NEXT:    v_min_f32_e32 v2, v2, v0
-; GFX908-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX908-NEXT:    v_bfe_u32 v8, v2, 16, 1
-; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v1
-; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX908-NEXT:    v_add3_u32 v5, v5, v1, s8
-; GFX908-NEXT:    v_add3_u32 v8, v8, v2, s8
-; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX908-NEXT:    v_cndmask_b32_e64 v1, v5, v7, s[4:5]
-; GFX908-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
-; GFX908-NEXT:    v_perm_b32 v5, v2, v1, s9
-; GFX908-NEXT:    v_mov_b32_e32 v1, v5
-; GFX908-NEXT:    v_mov_b32_e32 v2, v6
-; GFX908-NEXT:    buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v5, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v6
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8339,38 +8311,38 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX8-NEXT:    buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX8-NEXT:    s_add_i32 s4, s20, 0x400
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX8-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v6, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX8-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_min_f32_e32 v2, v2, v0
-; GFX8-NEXT:    v_bfe_u32 v5, v1, 16, 1
-; GFX8-NEXT:    v_bfe_u32 v8, v2, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v1
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v2
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_min_f32_e32 v5, v5, v3
+; GFX8-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v1
-; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v1, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v5, v7, s[4:5]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_alignbit_b32 v5, v2, v1, 16
-; GFX8-NEXT:    v_mov_b32_e32 v1, v5
-; GFX8-NEXT:    v_mov_b32_e32 v2, v6
-; GFX8-NEXT:    buffer_atomic_cmpswap v[1:2], v4, s[16:19], 0 offen glc
+; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_alignbit_b32 v0, v5, v0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v6, v1
+; GFX8-NEXT:    v_mov_b32_e32 v5, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end

diff  --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index a125880fd46a2..f8e13fcdd2273 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -220,22 +220,19 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0:       ; %bb.0: ; %_udiv-special-cases
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v20, v6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v2
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v2
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v0
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
@@ -246,32 +243,31 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v3
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v9
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v10
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 0
-; GFX9-O0-NEXT:    ; implicit-def: $vgpr29 : SGPR spill to VGPR lane
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s6, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s7, 1
+; GFX9-O0-NEXT:    ; implicit-def: $vgpr30 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s6, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s7, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s10, s6
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s10, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s10, 2
 ; GFX9-O0-NEXT:    s_mov_b32 s11, s7
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s11, 3
-; GFX9-O0-NEXT:    v_sub_co_u32_e32 v5, vcc, s10, v0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s11
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v3, vcc, v2, v1, vcc
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s10
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v2, v13, vcc
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s11
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v19, vcc
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s11, 3
+; GFX9-O0-NEXT:    v_sub_co_u32_e32 v5, vcc, s10, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s11
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v3, vcc, v0, v2, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s10
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v0, v13, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s11
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v0, vcc, v0, v19, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
@@ -279,25 +275,25 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v6
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX9-O0-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[9:10], s[4:5]
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[4:5]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v5
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[4:5]
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v5
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v3
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v5
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, v19, v2, s[4:5]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v4
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v13, v2, s[4:5]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v5
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v19, v0, s[4:5]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, v13, v0, s[4:5]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v7
 ; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 killed $vgpr7_vgpr8 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v20
@@ -428,18 +424,18 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v6
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[8:9], v[4:5], s[8:9]
-; GFX9-O0-NEXT:    v_ffbh_u32_e64 v4, v2
+; GFX9-O0-NEXT:    v_ffbh_u32_e64 v4, v0
 ; GFX9-O0-NEXT:    v_add_u32_e64 v4, v4, s13
-; GFX9-O0-NEXT:    v_ffbh_u32_e64 v5, v3
+; GFX9-O0-NEXT:    v_ffbh_u32_e64 v5, v1
 ; GFX9-O0-NEXT:    v_min_u32_e64 v5, v4, v5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr16
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
-; GFX9-O0-NEXT:    v_ffbh_u32_e64 v4, v0
+; GFX9-O0-NEXT:    v_ffbh_u32_e64 v4, v2
 ; GFX9-O0-NEXT:    v_add_u32_e64 v4, v4, s13
-; GFX9-O0-NEXT:    v_ffbh_u32_e64 v10, v1
+; GFX9-O0-NEXT:    v_ffbh_u32_e64 v10, v3
 ; GFX9-O0-NEXT:    v_min_u32_e64 v11, v4, v10
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr13
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s12
@@ -526,33 +522,33 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v6
 ; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s11
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v3, v4, s[8:9]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s10
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[8:9]
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
-; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s11
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v1, v4, s[8:9]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s10
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[8:9]
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s11
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v3, v4, s[8:9]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[8:9]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GFX9-O0-NEXT:    s_and_b64 s[6:7], s[4:5], s[6:7]
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_and_b64 s[6:7], s[4:5], s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s4, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s5, 5
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s4, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s5, 5
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
@@ -560,64 +556,67 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_branch .LBB0_8
 ; GFX9-O0-NEXT:  .LBB0_1: ; %Flow
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v29, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v29, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v30, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v30, 7
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-O0-NEXT:  ; %bb.2: ; %Flow
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_5
 ; GFX9-O0-NEXT:  .LBB0_3: ; %Flow2
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v29, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v29, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v30, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v30, 5
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_9
 ; GFX9-O0-NEXT:  .LBB0_4: ; %udiv-loop-exit
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 1
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[2:3], s4, v[0:1]
@@ -642,408 +641,408 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_or_b32_e64 v2, v2, v3
 ; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v4
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_3
 ; GFX9-O0-NEXT:  .LBB0_5: ; %Flow1
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v29, 8
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v29, 9
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v30, 8
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v30, 9
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_4
 ; GFX9-O0-NEXT:  .LBB0_6: ; %udiv-do-while
 ; GFX9-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s6, v29, 10
-; GFX9-O0-NEXT:    v_readlane_b32 s7, v29, 11
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_readlane_b32 s6, v30, 10
+; GFX9-O0-NEXT:    v_readlane_b32 s7, v30, 11
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 63
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[6:7], s4, v[2:3]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v7
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[28:29], s4, v[2:3]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v29
 ; GFX9-O0-NEXT:    s_mov_b32 s5, 1
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[26:27], s5, v[26:27]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v27
-; GFX9-O0-NEXT:    v_or_b32_e64 v14, v14, v15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v26
-; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v7
-; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v14
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[26:27], s5, v[2:3]
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[14:15], s4, v[10:11]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v27
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v15
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[22:23], s5, v[22:23]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v23
+; GFX9-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v28
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v22
+; GFX9-O0-NEXT:    v_or_b32_e64 v22, v5, v10
+; GFX9-O0-NEXT:    ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v4
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[28:29], s5, v[2:3]
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[4:5], s4, v[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v29
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX9-O0-NEXT:    v_or_b32_e64 v2, v2, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v26
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    v_or_b32_e64 v14, v3, v14
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v28
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_or_b32_e64 v4, v3, v4
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[2:3], s5, v[0:1]
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[26:27], s5, v[10:11]
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[28:29], s5, v[6:7]
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[0:1], s4, v[0:1]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v29
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v27
-; GFX9-O0-NEXT:    v_mov_b32_e32 v28, v25
-; GFX9-O0-NEXT:    v_or3_b32 v10, v10, v11, v28
+; GFX9-O0-NEXT:    v_or3_b32 v6, v6, v7, v10
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v26
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v24
-; GFX9-O0-NEXT:    v_or3_b32 v0, v0, v1, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v28
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v26
+; GFX9-O0-NEXT:    v_or3_b32 v0, v0, v1, v7
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v17
-; GFX9-O0-NEXT:    v_or_b32_e64 v10, v10, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v25
+; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v24
 ; GFX9-O0-NEXT:    v_or_b32_e64 v2, v2, v3
 ; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v14
-; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v7
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v4
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v22
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v23
-; GFX9-O0-NEXT:    v_sub_co_u32_e32 v7, vcc, v7, v10
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v16, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v14, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v6, vcc, v5, v11, vcc
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v15
+; GFX9-O0-NEXT:    v_sub_co_u32_e32 v13, vcc, v13, v6
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v11, vcc, v11, v4, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v6
-; GFX9-O0-NEXT:    v_ashrrev_i64 v[5:6], s4, v[4:5]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v6
+; GFX9-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v7
+; GFX9-O0-NEXT:    v_ashrrev_i64 v[13:14], s4, v[11:12]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v14
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s8, s5
-; GFX9-O0-NEXT:    v_and_b32_e64 v4, v15, s8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v5
+; GFX9-O0-NEXT:    v_and_b32_e64 v12, v7, s8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v13
 ; GFX9-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
-; GFX9-O0-NEXT:    v_and_b32_e64 v6, v17, s4
-; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-O0-NEXT:    v_and_b32_e64 v14, v11, s4
+; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, 0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v21
-; GFX9-O0-NEXT:    v_and_b32_e64 v22, v15, v22
-; GFX9-O0-NEXT:    v_and_b32_e64 v20, v17, v20
+; GFX9-O0-NEXT:    v_and_b32_e64 v22, v7, v22
+; GFX9-O0-NEXT:    v_and_b32_e64 v20, v11, v20
 ; GFX9-O0-NEXT:    ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v22
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v19
-; GFX9-O0-NEXT:    v_and_b32_e64 v15, v15, v22
-; GFX9-O0-NEXT:    v_and_b32_e64 v17, v17, v18
-; GFX9-O0-NEXT:    ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v17
-; GFX9-O0-NEXT:    ; kill: def $vgpr18 killed $vgpr18 killed $vgpr17_vgpr18 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v20
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v21
-; GFX9-O0-NEXT:    v_sub_co_u32_e32 v10, vcc, v10, v19
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v16, vcc, v16, v18, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v14, vcc, v14, v17, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v11, vcc, v11, v15, vcc
+; GFX9-O0-NEXT:    v_and_b32_e64 v7, v7, v22
+; GFX9-O0-NEXT:    v_and_b32_e64 v22, v11, v18
+; GFX9-O0-NEXT:    ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v22
+; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v23
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v20
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v21
+; GFX9-O0-NEXT:    v_sub_co_u32_e32 v6, vcc, v6, v19
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v10, vcc, v10, v18, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v11, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v5, v7, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v11
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v16
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v10
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], -1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s4, s9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s5
-; GFX9-O0-NEXT:    v_add_co_u32_e32 v11, vcc, v11, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s4
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v12, vcc
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s5
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v8, vcc, v8, v12, vcc
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s4
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v10, vcc, v10, v12, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v17
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, s5
+; GFX9-O0-NEXT:    v_add_co_u32_e32 v19, vcc, v11, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s4
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v11, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s5
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v16, vcc, v10, v11, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, s4
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v8, vcc, v8, v10, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v9
+; GFX9-O0-NEXT:    ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v20, v9
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v8
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v11
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v12
-; GFX9-O0-NEXT:    v_or_b32_e64 v10, v10, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v11
-; GFX9-O0-NEXT:    v_or_b32_e64 v8, v8, v9
-; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v10
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[8:9], v[4:5]
+; GFX9-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v17
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v19
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v20
+; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v17
+; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v20
+; GFX9-O0-NEXT:    v_or_b32_e64 v18, v18, v21
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v19
+; GFX9-O0-NEXT:    v_or_b32_e64 v16, v16, v17
+; GFX9-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v18
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[16:17], v[12:13]
 ; GFX9-O0-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v2
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v2
+; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v0
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v0
+; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v15
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v14
+; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v4
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v13
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v12
+; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s6, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s7, 7
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s6, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s7, 7
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s6, 10
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s7, 11
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s6, 10
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s7, 11
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-O0-NEXT:    s_cbranch_execnz .LBB0_6
 ; GFX9-O0-NEXT:    s_branch .LBB0_1
 ; GFX9-O0-NEXT:  .LBB0_7: ; %udiv-preheader
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(9)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v10
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[14:15], v8, v[18:19]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v15
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[6:7], v4, v[18:19]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 64
-; GFX9-O0-NEXT:    v_sub_u32_e64 v20, s4, v8
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[20:21], v20, v[16:17]
+; GFX9-O0-NEXT:    v_sub_u32_e64 v20, s4, v4
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v21
-; GFX9-O0-NEXT:    v_or_b32_e64 v11, v11, v22
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v20
-; GFX9-O0-NEXT:    v_or_b32_e64 v14, v14, v15
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v11
-; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v15
-; GFX9-O0-NEXT:    v_cmp_lt_u32_e64 s[6:7], v8, s4
-; GFX9-O0-NEXT:    v_sub_u32_e64 v11, v8, s4
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[20:21], v11, v[16:17]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v21
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v11, v11, v22, s[6:7]
+; GFX9-O0-NEXT:    v_or_b32_e64 v5, v5, v22
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v20
+; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v7
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v7
+; GFX9-O0-NEXT:    v_cmp_lt_u32_e64 s[6:7], v4, s4
+; GFX9-O0-NEXT:    v_sub_u32_e64 v5, v4, s4
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[20:21], v5, v[14:15]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v21
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v22, s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 0
-; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, s4
+; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[4:5], v4, s4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v19
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v11, v11, v22, s[4:5]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v14
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v20
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v18
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[4:5]
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v22, s[4:5]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v20
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v18
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[4:5]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v11
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[16:17], v8, v[16:17]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v17
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[4:5], v4, v[14:15]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v5
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-O0-NEXT:    s_mov_b32 s8, s5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s8
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v8, v8, v11, s[6:7]
-; GFX9-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 killed $vgpr16_vgpr17 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, s8
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX9-O0-NEXT:    s_mov_b32 s8, s4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s8
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v16, v11, v16, s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s8
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[6:7]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
-; GFX9-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v8
-; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v10
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v13
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], -1
 ; GFX9-O0-NEXT:    s_mov_b32 s7, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s6, s9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s7
-; GFX9-O0-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s6
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v12, vcc, v11, v12, vcc
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s7
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v10, vcc, v10, v11, vcc
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s6
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v11, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v17
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, s7
+; GFX9-O0-NEXT:    v_add_co_u32_e32 v16, vcc, v15, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, s6
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v12, vcc, v12, v15, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, s7
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v18, vcc, v14, v15, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, s6
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v13, vcc, v13, v14, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
-; GFX9-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v9
+; GFX9-O0-NEXT:    ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v13
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
-; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s4
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v12
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s4
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, s5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, s4
+; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s4, 10
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s5, 11
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s4, 10
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s5, 11
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_6
 ; GFX9-O0-NEXT:  .LBB0_8: ; %udiv-bb1
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -1056,129 +1055,129 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX9-O0-NEXT:    s_mov_b32 s8, s6
 ; GFX9-O0-NEXT:    s_mov_b32 s9, s7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s5
-; GFX9-O0-NEXT:    v_add_co_u32_e32 v7, vcc, v3, v4
+; GFX9-O0-NEXT:    v_add_co_u32_e32 v8, vcc, v3, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s8
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v4, vcc, v2, v4, vcc
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s9
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v0, vcc, v0, v2, vcc
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s9
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v1
+; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v1
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v8
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 0x7f
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v2, s4, v3
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[0:1], v2, v[11:12]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v1
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[4:5], v2, v[10:11]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v5
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 64
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v13, s4, v2
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[13:14], v13, v[9:10]
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[13:14], v13, v[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v14
-; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v15
-; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v13
-; GFX9-O0-NEXT:    v_or_b32_e64 v0, v0, v1
-; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v1
+; GFX9-O0-NEXT:    v_or_b32_e64 v12, v12, v15
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v13
+; GFX9-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v5
 ; GFX9-O0-NEXT:    v_cmp_lt_u32_e64 s[4:5], v2, s4
 ; GFX9-O0-NEXT:    s_mov_b32 s10, 63
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v3, s10, v3
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[13:14], v3, v[9:10]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v14
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[4:5]
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[12:13], v3, v[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v13
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[4:5]
 ; GFX9-O0-NEXT:    s_mov_b32 s10, 0
 ; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[10:11], v2, s10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v12
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[10:11]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v13
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v11
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[10:11]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v11
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[10:11]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v12
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[4:5]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v10
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[10:11]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr10
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr10
-; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[2:3], v2, v[9:10]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s9
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[4:5]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[6:7], v2, v[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[4:5]
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v3, v6, s[4:5]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v6
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v2
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v8
-; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v7
-; GFX9-O0-NEXT:    v_or_b32_e64 v4, v4, v5
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v6
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[4:5], v[4:5], s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, s7
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v9
+; GFX9-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v8
+; GFX9-O0-NEXT:    v_or_b32_e64 v0, v0, v1
+; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[4:5], v[0:1], s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, s7
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9-O0-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GFX9-O0-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s6, 8
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s7, 9
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s6, 8
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s7, 9
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT:    s_cbranch_execz .LBB0_5
 ; GFX9-O0-NEXT:    s_branch .LBB0_7
 ; GFX9-O0-NEXT:  .LBB0_9: ; %udiv-end
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
@@ -1222,7 +1221,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[3:4], s4, v[3:4]
 ; GFX9-O0-NEXT:    ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
@@ -1693,26 +1692,29 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_readlane_b32 s5, v31, 3
 ; GFX9-G-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-G-O0-NEXT:  ; %bb.2: ; %Flow
-; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    s_nop 0
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_branch .LBB0_5
 ; GFX9-G-O0-NEXT:  .LBB0_3: ; %Flow2
@@ -1798,14 +1800,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_readlane_b32 s4, v31, 4
 ; GFX9-G-O0-NEXT:    v_readlane_b32 s5, v31, 5
 ; GFX9-G-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
@@ -2248,20 +2250,20 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX9-G-O0-NEXT:    v_cmp_ne_u64_e64 s[4:5], v[4:5], v[6:7]
-; GFX9-G-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_nop 0
-; GFX9-G-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-G-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_nop 0
-; GFX9-G-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9-G-O0-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GFX9-G-O0-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
@@ -2496,7 +2498,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0:       ; %bb.0: ; %_udiv-special-cases
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v6
 ; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
@@ -2553,16 +2555,16 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 0
-; GFX9-O0-NEXT:    ; implicit-def: $vgpr29 : SGPR spill to VGPR lane
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s6, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s7, 1
+; GFX9-O0-NEXT:    ; implicit-def: $vgpr30 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s6, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s7, 1
 ; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[0:1], s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v9
-; GFX9-O0-NEXT:    v_or_b32_e64 v14, v1, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v8
-; GFX9-O0-NEXT:    v_or_b32_e64 v8, v0, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v13
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v9
+; GFX9-O0-NEXT:    v_or_b32_e64 v14, v3, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v8
+; GFX9-O0-NEXT:    v_or_b32_e64 v8, v2, v0
 ; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v14
 ; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7]
@@ -2607,18 +2609,18 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT:    v_ffbh_u32_e64 v4, v2
+; GFX9-O0-NEXT:    v_ffbh_u32_e64 v4, v0
 ; GFX9-O0-NEXT:    v_add_u32_e64 v4, v4, s9
-; GFX9-O0-NEXT:    v_ffbh_u32_e64 v5, v3
+; GFX9-O0-NEXT:    v_ffbh_u32_e64 v5, v1
 ; GFX9-O0-NEXT:    v_min_u32_e64 v5, v4, v5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
-; GFX9-O0-NEXT:    v_ffbh_u32_e64 v4, v0
+; GFX9-O0-NEXT:    v_ffbh_u32_e64 v4, v2
 ; GFX9-O0-NEXT:    v_add_u32_e64 v4, v4, s9
-; GFX9-O0-NEXT:    v_ffbh_u32_e64 v10, v1
+; GFX9-O0-NEXT:    v_ffbh_u32_e64 v10, v3
 ; GFX9-O0-NEXT:    v_min_u32_e64 v14, v4, v10
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s8
@@ -2709,33 +2711,33 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v6
 ; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s11
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v3, v4, s[8:9]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s10
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[8:9]
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
-; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s11
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v1, v4, s[8:9]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s10
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[8:9]
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s11
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v3, v4, s[8:9]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[8:9]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GFX9-O0-NEXT:    s_and_b64 s[6:7], s[4:5], s[6:7]
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_and_b64 s[6:7], s[4:5], s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s4, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s5, 3
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s4, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s5, 3
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
@@ -2743,47 +2745,50 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_branch .LBB1_8
 ; GFX9-O0-NEXT:  .LBB1_1: ; %Flow
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v29, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v29, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v30, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v30, 5
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-O0-NEXT:  ; %bb.2: ; %Flow
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB1_5
 ; GFX9-O0-NEXT:  .LBB1_3: ; %Flow2
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v29, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v29, 3
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v30, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v30, 3
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
@@ -2825,29 +2830,29 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_or_b32_e64 v2, v2, v3
 ; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v4
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB1_3
 ; GFX9-O0-NEXT:  .LBB1_5: ; %Flow1
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v29, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v29, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v30, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v30, 7
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
@@ -2865,214 +2870,214 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:  .LBB1_6: ; %udiv-do-while
 ; GFX9-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s6, v29, 8
-; GFX9-O0-NEXT:    v_readlane_b32 s7, v29, 9
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_readlane_b32 s6, v30, 8
+; GFX9-O0-NEXT:    v_readlane_b32 s7, v30, 9
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 63
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[6:7], s4, v[2:3]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v7
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[28:29], s4, v[2:3]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v29
 ; GFX9-O0-NEXT:    s_mov_b32 s5, 1
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[26:27], s5, v[26:27]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v27
-; GFX9-O0-NEXT:    v_or_b32_e64 v14, v14, v15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v26
-; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v7
-; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v14
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[26:27], s5, v[2:3]
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[14:15], s4, v[10:11]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v27
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v15
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[22:23], s5, v[22:23]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v23
+; GFX9-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v28
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v22
+; GFX9-O0-NEXT:    v_or_b32_e64 v22, v5, v10
+; GFX9-O0-NEXT:    ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v4
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[28:29], s5, v[2:3]
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[4:5], s4, v[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v29
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX9-O0-NEXT:    v_or_b32_e64 v2, v2, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v26
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    v_or_b32_e64 v14, v3, v14
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v28
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_or_b32_e64 v4, v3, v4
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[2:3], s5, v[0:1]
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[26:27], s5, v[10:11]
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[28:29], s5, v[6:7]
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[0:1], s4, v[0:1]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v29
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v27
-; GFX9-O0-NEXT:    v_mov_b32_e32 v28, v25
-; GFX9-O0-NEXT:    v_or3_b32 v10, v10, v11, v28
+; GFX9-O0-NEXT:    v_or3_b32 v6, v6, v7, v10
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v26
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v24
-; GFX9-O0-NEXT:    v_or3_b32 v0, v0, v1, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v28
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v26
+; GFX9-O0-NEXT:    v_or3_b32 v0, v0, v1, v7
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v17
-; GFX9-O0-NEXT:    v_or_b32_e64 v10, v10, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v25
+; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v24
 ; GFX9-O0-NEXT:    v_or_b32_e64 v2, v2, v3
 ; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v14
-; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v7
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v4
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v22
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v23
-; GFX9-O0-NEXT:    v_sub_co_u32_e32 v7, vcc, v7, v10
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v16, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v14, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v6, vcc, v5, v11, vcc
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v15
+; GFX9-O0-NEXT:    v_sub_co_u32_e32 v13, vcc, v13, v6
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v11, vcc, v11, v4, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v6
-; GFX9-O0-NEXT:    v_ashrrev_i64 v[5:6], s4, v[4:5]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v6
+; GFX9-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v7
+; GFX9-O0-NEXT:    v_ashrrev_i64 v[13:14], s4, v[11:12]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v14
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s8, s5
-; GFX9-O0-NEXT:    v_and_b32_e64 v4, v15, s8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v5
+; GFX9-O0-NEXT:    v_and_b32_e64 v12, v7, s8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v13
 ; GFX9-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
-; GFX9-O0-NEXT:    v_and_b32_e64 v6, v17, s4
-; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-O0-NEXT:    v_and_b32_e64 v14, v11, s4
+; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, 0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v21
-; GFX9-O0-NEXT:    v_and_b32_e64 v22, v15, v22
-; GFX9-O0-NEXT:    v_and_b32_e64 v20, v17, v20
+; GFX9-O0-NEXT:    v_and_b32_e64 v22, v7, v22
+; GFX9-O0-NEXT:    v_and_b32_e64 v20, v11, v20
 ; GFX9-O0-NEXT:    ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v22
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v19
-; GFX9-O0-NEXT:    v_and_b32_e64 v15, v15, v22
-; GFX9-O0-NEXT:    v_and_b32_e64 v17, v17, v18
-; GFX9-O0-NEXT:    ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v17
-; GFX9-O0-NEXT:    ; kill: def $vgpr18 killed $vgpr18 killed $vgpr17_vgpr18 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v20
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v21
-; GFX9-O0-NEXT:    v_sub_co_u32_e32 v10, vcc, v10, v19
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v16, vcc, v16, v18, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v14, vcc, v14, v17, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v11, vcc, v11, v15, vcc
+; GFX9-O0-NEXT:    v_and_b32_e64 v7, v7, v22
+; GFX9-O0-NEXT:    v_and_b32_e64 v22, v11, v18
+; GFX9-O0-NEXT:    ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v22
+; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v23
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v20
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v21
+; GFX9-O0-NEXT:    v_sub_co_u32_e32 v6, vcc, v6, v19
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v10, vcc, v10, v18, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v11, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v5, v7, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v11
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v16
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v10
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], -1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s4, s9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s5
-; GFX9-O0-NEXT:    v_add_co_u32_e32 v11, vcc, v11, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s4
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v12, vcc
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s5
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v8, vcc, v8, v12, vcc
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s4
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v10, vcc, v10, v12, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v17
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, s5
+; GFX9-O0-NEXT:    v_add_co_u32_e32 v19, vcc, v11, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s4
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v11, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s5
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v16, vcc, v10, v11, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, s4
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v8, vcc, v8, v10, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v9
+; GFX9-O0-NEXT:    ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v20, v9
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v8
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v11
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v12
-; GFX9-O0-NEXT:    v_or_b32_e64 v10, v10, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v11
-; GFX9-O0-NEXT:    v_or_b32_e64 v8, v8, v9
-; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v10
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[8:9], v[4:5]
+; GFX9-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v17
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v19
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v20
+; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v17
+; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v20
+; GFX9-O0-NEXT:    v_or_b32_e64 v18, v18, v21
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v19
+; GFX9-O0-NEXT:    v_or_b32_e64 v16, v16, v17
+; GFX9-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v18
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[16:17], v[12:13]
 ; GFX9-O0-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v2
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v2
+; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v0
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v0
+; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v15
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v14
+; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v4
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v13
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v12
+; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s6, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s7, 5
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s6, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s7, 5
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s6, 8
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s7, 9
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s6, 8
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s7, 9
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
@@ -3090,128 +3095,128 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_branch .LBB1_1
 ; GFX9-O0-NEXT:  .LBB1_7: ; %udiv-preheader
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(9)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v10
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[14:15], v8, v[18:19]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v15
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[6:7], v4, v[18:19]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 64
-; GFX9-O0-NEXT:    v_sub_u32_e64 v20, s4, v8
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[20:21], v20, v[16:17]
+; GFX9-O0-NEXT:    v_sub_u32_e64 v20, s4, v4
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v21
-; GFX9-O0-NEXT:    v_or_b32_e64 v11, v11, v22
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v20
-; GFX9-O0-NEXT:    v_or_b32_e64 v14, v14, v15
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v11
-; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v15
-; GFX9-O0-NEXT:    v_cmp_lt_u32_e64 s[6:7], v8, s4
-; GFX9-O0-NEXT:    v_sub_u32_e64 v11, v8, s4
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[20:21], v11, v[16:17]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v21
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v11, v11, v22, s[6:7]
+; GFX9-O0-NEXT:    v_or_b32_e64 v5, v5, v22
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v20
+; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v7
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v7
+; GFX9-O0-NEXT:    v_cmp_lt_u32_e64 s[6:7], v4, s4
+; GFX9-O0-NEXT:    v_sub_u32_e64 v5, v4, s4
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[20:21], v5, v[14:15]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v21
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v22, s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 0
-; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, s4
+; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[4:5], v4, s4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v19
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v11, v11, v22, s[4:5]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v14
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v20
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v18
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[4:5]
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v22, s[4:5]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v20
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v18
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[4:5]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v11
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[16:17], v8, v[16:17]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v17
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[4:5], v4, v[14:15]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v5
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-O0-NEXT:    s_mov_b32 s8, s5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s8
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v8, v8, v11, s[6:7]
-; GFX9-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 killed $vgpr16_vgpr17 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, s8
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX9-O0-NEXT:    s_mov_b32 s8, s4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s8
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v16, v11, v16, s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s8
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[6:7]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
-; GFX9-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v8
-; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v10
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v13
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], -1
 ; GFX9-O0-NEXT:    s_mov_b32 s7, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s6, s9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s7
-; GFX9-O0-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s6
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v12, vcc, v11, v12, vcc
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s7
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v10, vcc, v10, v11, vcc
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s6
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v11, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v17
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, s7
+; GFX9-O0-NEXT:    v_add_co_u32_e32 v16, vcc, v15, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, s6
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v12, vcc, v12, v15, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, s7
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v18, vcc, v14, v15, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, s6
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v13, vcc, v13, v14, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
-; GFX9-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v9
+; GFX9-O0-NEXT:    ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v13
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
-; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s4
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v12
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s4
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, s5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, s4
+; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s4, 8
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s5, 9
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s4, 8
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s5, 9
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
@@ -3221,12 +3226,12 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_branch .LBB1_6
 ; GFX9-O0-NEXT:  .LBB1_8: ; %udiv-bb1
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -3239,118 +3244,118 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX9-O0-NEXT:    s_mov_b32 s8, s6
 ; GFX9-O0-NEXT:    s_mov_b32 s9, s7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s5
-; GFX9-O0-NEXT:    v_add_co_u32_e32 v7, vcc, v3, v4
+; GFX9-O0-NEXT:    v_add_co_u32_e32 v8, vcc, v3, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s8
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v4, vcc, v2, v4, vcc
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s9
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v0, vcc, v0, v2, vcc
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s9
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v1
+; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v1
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v8
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 0x7f
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v2, s4, v3
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[0:1], v2, v[11:12]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v1
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[4:5], v2, v[10:11]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v5
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 64
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v13, s4, v2
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[13:14], v13, v[9:10]
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[13:14], v13, v[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v14
-; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v15
-; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v13
-; GFX9-O0-NEXT:    v_or_b32_e64 v0, v0, v1
-; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v1
+; GFX9-O0-NEXT:    v_or_b32_e64 v12, v12, v15
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v13
+; GFX9-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v5
 ; GFX9-O0-NEXT:    v_cmp_lt_u32_e64 s[4:5], v2, s4
 ; GFX9-O0-NEXT:    s_mov_b32 s10, 63
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v3, s10, v3
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[13:14], v3, v[9:10]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v14
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[4:5]
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[12:13], v3, v[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v13
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[4:5]
 ; GFX9-O0-NEXT:    s_mov_b32 s10, 0
 ; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[10:11], v2, s10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v12
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[10:11]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v13
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v11
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[10:11]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v11
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[10:11]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v12
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[4:5]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v10
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[10:11]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr10
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr10
-; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[2:3], v2, v[9:10]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s9
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[4:5]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[6:7], v2, v[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[4:5]
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v3, v6, s[4:5]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v6
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v2
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v8
-; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v7
-; GFX9-O0-NEXT:    v_or_b32_e64 v4, v4, v5
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v6
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[4:5], v[4:5], s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, s7
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v9
+; GFX9-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v8
+; GFX9-O0-NEXT:    v_or_b32_e64 v0, v0, v1
+; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[4:5], v[0:1], s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, s7
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9-O0-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GFX9-O0-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s6, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s7, 7
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s6, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s7, 7
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT:    s_cbranch_execz .LBB1_5
@@ -3369,7 +3374,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
@@ -3779,26 +3784,29 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_readlane_b32 s5, v32, 3
 ; GFX9-G-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-G-O0-NEXT:  ; %bb.2: ; %Flow
-; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    s_nop 0
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_branch .LBB1_5
 ; GFX9-G-O0-NEXT:  .LBB1_3: ; %Flow2
@@ -3884,14 +3892,14 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_readlane_b32 s4, v32, 4
 ; GFX9-G-O0-NEXT:    v_readlane_b32 s5, v32, 5
 ; GFX9-G-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-G-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-G-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(3)
@@ -4347,20 +4355,20 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX9-G-O0-NEXT:    v_cmp_ne_u64_e64 s[4:5], v[4:5], v[6:7]
-; GFX9-G-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_nop 0
-; GFX9-G-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-G-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_nop 0
-; GFX9-G-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-G-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; GFX9-G-O0-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9-G-O0-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GFX9-G-O0-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]

diff  --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index de439c6f46c6e..77b78f1f8a333 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -7,37 +7,37 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG:       ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_sub_i32_e32 v16, vcc, 0, v0
-; SDAG-NEXT:    v_mov_b32_e32 v20, 0
+; SDAG-NEXT:    v_mov_b32_e32 v18, 0
 ; SDAG-NEXT:    v_ashrrev_i32_e32 v24, 31, v3
 ; SDAG-NEXT:    v_ashrrev_i32_e32 v25, 31, v11
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0x7f
 ; SDAG-NEXT:    v_subb_u32_e32 v17, vcc, 0, v1, vcc
 ; SDAG-NEXT:    v_mov_b32_e32 v26, v24
 ; SDAG-NEXT:    v_mov_b32_e32 v27, v25
-; SDAG-NEXT:    v_subb_u32_e32 v21, vcc, 0, v2, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v19, vcc, 0, v2, vcc
 ; SDAG-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
-; SDAG-NEXT:    v_cndmask_b32_e64 v19, v1, v17, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v18, v0, v16, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v21, v1, v17, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v20, v0, v16, s[4:5]
 ; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, 0, v3, vcc
-; SDAG-NEXT:    v_cndmask_b32_e64 v16, v2, v21, s[4:5]
-; SDAG-NEXT:    v_ffbh_u32_e32 v1, v18
-; SDAG-NEXT:    v_ffbh_u32_e32 v2, v19
+; SDAG-NEXT:    v_cndmask_b32_e64 v16, v2, v19, s[4:5]
+; SDAG-NEXT:    v_ffbh_u32_e32 v1, v20
+; SDAG-NEXT:    v_ffbh_u32_e32 v2, v21
 ; SDAG-NEXT:    v_cndmask_b32_e64 v17, v3, v0, s[4:5]
-; SDAG-NEXT:    v_or_b32_e32 v0, v18, v16
+; SDAG-NEXT:    v_or_b32_e32 v0, v20, v16
 ; SDAG-NEXT:    v_sub_i32_e32 v3, vcc, 0, v8
-; SDAG-NEXT:    v_add_i32_e64 v21, s[4:5], 32, v1
+; SDAG-NEXT:    v_add_i32_e64 v19, s[4:5], 32, v1
 ; SDAG-NEXT:    v_ffbh_u32_e32 v22, v16
-; SDAG-NEXT:    v_or_b32_e32 v1, v19, v17
+; SDAG-NEXT:    v_or_b32_e32 v1, v21, v17
 ; SDAG-NEXT:    v_subb_u32_e32 v23, vcc, 0, v9, vcc
-; SDAG-NEXT:    v_min_u32_e32 v2, v21, v2
-; SDAG-NEXT:    v_add_i32_e64 v21, s[4:5], 32, v22
+; SDAG-NEXT:    v_min_u32_e32 v2, v19, v2
+; SDAG-NEXT:    v_add_i32_e64 v19, s[4:5], 32, v22
 ; SDAG-NEXT:    v_ffbh_u32_e32 v22, v17
 ; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; SDAG-NEXT:    v_cmp_gt_i64_e64 s[6:7], 0, v[10:11]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v28, v9, v23, s[6:7]
 ; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, 0, v10, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e64 v29, v8, v3, s[6:7]
-; SDAG-NEXT:    v_min_u32_e32 v1, v21, v22
+; SDAG-NEXT:    v_min_u32_e32 v1, v19, v22
 ; SDAG-NEXT:    v_add_i32_e64 v2, s[8:9], 64, v2
 ; SDAG-NEXT:    v_addc_u32_e64 v3, s[8:9], 0, 0, s[8:9]
 ; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, 0, v11, vcc
@@ -46,17 +46,17 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_cndmask_b32_e64 v9, v3, 0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v10, v2, v1, vcc
 ; SDAG-NEXT:    v_ffbh_u32_e32 v3, v29
-; SDAG-NEXT:    v_ffbh_u32_e32 v21, v28
+; SDAG-NEXT:    v_ffbh_u32_e32 v19, v28
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, v11, v8, s[6:7]
 ; SDAG-NEXT:    v_or_b32_e32 v2, v29, v0
 ; SDAG-NEXT:    v_add_i32_e32 v8, vcc, 32, v3
 ; SDAG-NEXT:    v_ffbh_u32_e32 v11, v0
 ; SDAG-NEXT:    v_or_b32_e32 v3, v28, v1
-; SDAG-NEXT:    v_min_u32_e32 v8, v8, v21
+; SDAG-NEXT:    v_min_u32_e32 v8, v8, v19
 ; SDAG-NEXT:    v_add_i32_e32 v11, vcc, 32, v11
-; SDAG-NEXT:    v_ffbh_u32_e32 v21, v1
+; SDAG-NEXT:    v_ffbh_u32_e32 v19, v1
 ; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; SDAG-NEXT:    v_min_u32_e32 v2, v11, v21
+; SDAG-NEXT:    v_min_u32_e32 v2, v11, v19
 ; SDAG-NEXT:    v_add_i32_e64 v3, s[6:7], 64, v8
 ; SDAG-NEXT:    v_addc_u32_e64 v8, s[6:7], 0, 0, s[6:7]
 ; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[0:1]
@@ -66,35 +66,35 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
 ; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v8, v9, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v8, 0x7f, v2
-; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, 0, v20, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, 0, v18, vcc
 ; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[2:3]
-; SDAG-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s[4:5]
-; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, 0, v20, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, 0, v18, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v8, v8, v10
 ; SDAG-NEXT:    v_or_b32_e32 v9, v3, v11
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
 ; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
-; SDAG-NEXT:    v_cndmask_b32_e64 v8, v20, v21, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v18, v19, s[4:5]
 ; SDAG-NEXT:    v_and_b32_e32 v8, 1, v8
 ; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v8
 ; SDAG-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v20, v17, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v18, v17, 0, s[4:5]
 ; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
 ; SDAG-NEXT:    v_cndmask_b32_e64 v22, v16, 0, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v21, v19, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v19, v21, 0, s[4:5]
 ; SDAG-NEXT:    s_and_b64 s[8:9], s[6:7], vcc
-; SDAG-NEXT:    v_cndmask_b32_e64 v23, v18, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v23, v20, 0, s[4:5]
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[8:9]
 ; SDAG-NEXT:    s_cbranch_execz .LBB0_6
 ; SDAG-NEXT:  ; %bb.1: ; %udiv-bb15
 ; SDAG-NEXT:    v_add_i32_e32 v30, vcc, 1, v2
-; SDAG-NEXT:    v_sub_i32_e64 v20, s[4:5], 63, v2
+; SDAG-NEXT:    v_sub_i32_e64 v18, s[4:5], 63, v2
 ; SDAG-NEXT:    v_mov_b32_e32 v8, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v9, 0
 ; SDAG-NEXT:    v_addc_u32_e32 v31, vcc, 0, v3, vcc
-; SDAG-NEXT:    v_lshl_b64 v[20:21], v[18:19], v20
+; SDAG-NEXT:    v_lshl_b64 v[18:19], v[20:21], v18
 ; SDAG-NEXT:    v_addc_u32_e32 v32, vcc, 0, v10, vcc
 ; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, 0, v11, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v10, v30, v32
@@ -102,16 +102,16 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_or_b32_e32 v11, v31, v33
 ; SDAG-NEXT:    v_lshl_b64 v[2:3], v[16:17], v34
 ; SDAG-NEXT:    v_sub_i32_e32 v35, vcc, 64, v34
-; SDAG-NEXT:    v_lshl_b64 v[22:23], v[18:19], v34
+; SDAG-NEXT:    v_lshl_b64 v[22:23], v[20:21], v34
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT:    v_lshr_b64 v[10:11], v[18:19], v35
+; SDAG-NEXT:    v_lshr_b64 v[10:11], v[20:21], v35
 ; SDAG-NEXT:    v_or_b32_e32 v3, v3, v11
 ; SDAG-NEXT:    v_or_b32_e32 v2, v2, v10
 ; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v34
-; SDAG-NEXT:    v_cndmask_b32_e64 v3, v21, v3, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, v20, v2, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v21, 0, v23, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v20, 0, v22, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v19, 0, v23, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v18, 0, v22, s[4:5]
 ; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v34
 ; SDAG-NEXT:    v_cndmask_b32_e64 v3, v3, v17, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v2, v2, v16, s[4:5]
@@ -121,7 +121,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; SDAG-NEXT:    s_cbranch_execz .LBB0_5
 ; SDAG-NEXT:  ; %bb.2: ; %udiv-preheader4
-; SDAG-NEXT:    v_lshr_b64 v[8:9], v[18:19], v30
+; SDAG-NEXT:    v_lshr_b64 v[8:9], v[20:21], v30
 ; SDAG-NEXT:    v_sub_i32_e32 v10, vcc, 64, v30
 ; SDAG-NEXT:    v_lshl_b64 v[10:11], v[16:17], v10
 ; SDAG-NEXT:    v_or_b32_e32 v11, v9, v11
@@ -131,9 +131,9 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_lshr_b64 v[8:9], v[16:17], v8
 ; SDAG-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
 ; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v30
-; SDAG-NEXT:    v_cndmask_b32_e64 v19, v9, v19, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v21, v9, v21, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
-; SDAG-NEXT:    v_cndmask_b32_e64 v18, v8, v18, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v20, v8, v20, s[4:5]
 ; SDAG-NEXT:    v_lshr_b64 v[8:9], v[16:17], v30
 ; SDAG-NEXT:    v_cndmask_b32_e32 v23, 0, v9, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v22, 0, v8, vcc
@@ -149,30 +149,30 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_mov_b32_e32 v9, 0
 ; SDAG-NEXT:  .LBB0_3: ; %udiv-do-while3
 ; SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SDAG-NEXT:    v_lshrrev_b32_e32 v8, 31, v21
-; SDAG-NEXT:    v_lshl_b64 v[20:21], v[20:21], 1
-; SDAG-NEXT:    v_lshl_b64 v[22:23], v[22:23], 1
-; SDAG-NEXT:    v_lshrrev_b32_e32 v38, 31, v19
+; SDAG-NEXT:    v_lshrrev_b32_e32 v8, 31, v19
 ; SDAG-NEXT:    v_lshl_b64 v[18:19], v[18:19], 1
+; SDAG-NEXT:    v_lshl_b64 v[22:23], v[22:23], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v38, 31, v21
+; SDAG-NEXT:    v_lshl_b64 v[20:21], v[20:21], 1
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v39, 31, v3
 ; SDAG-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; SDAG-NEXT:    v_or_b32_e32 v21, v17, v21
-; SDAG-NEXT:    v_or_b32_e32 v20, v16, v20
+; SDAG-NEXT:    v_or_b32_e32 v19, v17, v19
+; SDAG-NEXT:    v_or_b32_e32 v18, v16, v18
 ; SDAG-NEXT:    v_or_b32_e32 v16, v22, v38
-; SDAG-NEXT:    v_or_b32_e32 v17, v18, v39
+; SDAG-NEXT:    v_or_b32_e32 v17, v20, v39
 ; SDAG-NEXT:    v_or_b32_e32 v2, v2, v8
 ; SDAG-NEXT:    v_sub_i32_e32 v8, vcc, v34, v17
-; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, v35, v19, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, v35, v21, vcc
 ; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, v36, v16, vcc
 ; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, v37, v23, vcc
 ; SDAG-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
-; SDAG-NEXT:    v_and_b32_e32 v18, v8, v29
+; SDAG-NEXT:    v_and_b32_e32 v20, v8, v29
 ; SDAG-NEXT:    v_and_b32_e32 v22, v8, v28
 ; SDAG-NEXT:    v_and_b32_e32 v38, v8, v0
 ; SDAG-NEXT:    v_and_b32_e32 v39, v8, v1
 ; SDAG-NEXT:    v_and_b32_e32 v8, 1, v8
-; SDAG-NEXT:    v_sub_i32_e32 v18, vcc, v17, v18
-; SDAG-NEXT:    v_subb_u32_e32 v19, vcc, v19, v22, vcc
+; SDAG-NEXT:    v_sub_i32_e32 v20, vcc, v17, v20
+; SDAG-NEXT:    v_subb_u32_e32 v21, vcc, v21, v22, vcc
 ; SDAG-NEXT:    v_subb_u32_e32 v22, vcc, v16, v38, vcc
 ; SDAG-NEXT:    v_subb_u32_e32 v23, vcc, v23, v39, vcc
 ; SDAG-NEXT:    v_add_i32_e32 v30, vcc, -1, v30
@@ -194,11 +194,11 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:  .LBB0_5: ; %Flow14
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; SDAG-NEXT:    v_lshl_b64 v[0:1], v[2:3], 1
-; SDAG-NEXT:    v_lshrrev_b32_e32 v16, 31, v21
-; SDAG-NEXT:    v_lshl_b64 v[2:3], v[20:21], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v16, 31, v19
+; SDAG-NEXT:    v_lshl_b64 v[2:3], v[18:19], 1
 ; SDAG-NEXT:    v_or_b32_e32 v0, v0, v16
-; SDAG-NEXT:    v_or_b32_e32 v20, v11, v1
-; SDAG-NEXT:    v_or_b32_e32 v21, v9, v3
+; SDAG-NEXT:    v_or_b32_e32 v18, v11, v1
+; SDAG-NEXT:    v_or_b32_e32 v19, v9, v3
 ; SDAG-NEXT:    v_or_b32_e32 v22, v10, v0
 ; SDAG-NEXT:    v_or_b32_e32 v23, v8, v2
 ; SDAG-NEXT:  .LBB0_6: ; %Flow16
@@ -208,110 +208,110 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_sub_i32_e32 v0, vcc, 0, v4
 ; SDAG-NEXT:    v_mov_b32_e32 v8, 0
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0x7f
-; SDAG-NEXT:    v_mov_b32_e32 v18, v16
-; SDAG-NEXT:    v_mov_b32_e32 v19, v17
+; SDAG-NEXT:    v_mov_b32_e32 v20, v16
+; SDAG-NEXT:    v_mov_b32_e32 v21, v17
 ; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, 0, v5, vcc
 ; SDAG-NEXT:    v_subb_u32_e32 v9, vcc, 0, v6, vcc
 ; SDAG-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v3, v5, v1, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v2, v4, v0, s[4:5]
 ; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, 0, v7, vcc
-; SDAG-NEXT:    v_cndmask_b32_e64 v4, v6, v9, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[4:5]
 ; SDAG-NEXT:    v_ffbh_u32_e32 v1, v2
-; SDAG-NEXT:    v_ffbh_u32_e32 v6, v3
-; SDAG-NEXT:    v_cndmask_b32_e64 v5, v7, v0, s[4:5]
-; SDAG-NEXT:    v_sub_i32_e32 v7, vcc, 0, v12
-; SDAG-NEXT:    v_or_b32_e32 v0, v2, v4
-; SDAG-NEXT:    v_ffbh_u32_e32 v9, v4
+; SDAG-NEXT:    v_ffbh_u32_e32 v4, v3
+; SDAG-NEXT:    v_cndmask_b32_e64 v7, v7, v0, s[4:5]
+; SDAG-NEXT:    v_sub_i32_e32 v5, vcc, 0, v12
+; SDAG-NEXT:    v_or_b32_e32 v0, v2, v6
+; SDAG-NEXT:    v_ffbh_u32_e32 v9, v6
 ; SDAG-NEXT:    v_add_i32_e64 v10, s[4:5], 32, v1
 ; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, 0, v13, vcc
-; SDAG-NEXT:    v_or_b32_e32 v1, v3, v5
+; SDAG-NEXT:    v_or_b32_e32 v1, v3, v7
 ; SDAG-NEXT:    v_add_i32_e64 v9, s[4:5], 32, v9
-; SDAG-NEXT:    v_ffbh_u32_e32 v30, v5
-; SDAG-NEXT:    v_min_u32_e32 v6, v10, v6
+; SDAG-NEXT:    v_ffbh_u32_e32 v30, v7
+; SDAG-NEXT:    v_min_u32_e32 v4, v10, v4
 ; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, 0, v14, vcc
 ; SDAG-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[14:15]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v28, v13, v11, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v29, v12, v7, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v29, v12, v5, s[4:5]
 ; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[0:1]
 ; SDAG-NEXT:    v_min_u32_e32 v1, v9, v30
-; SDAG-NEXT:    v_add_i32_e64 v6, s[8:9], 64, v6
-; SDAG-NEXT:    v_addc_u32_e64 v7, s[8:9], 0, 0, s[8:9]
+; SDAG-NEXT:    v_add_i32_e64 v4, s[8:9], 64, v4
+; SDAG-NEXT:    v_addc_u32_e64 v5, s[8:9], 0, 0, s[8:9]
 ; SDAG-NEXT:    v_subb_u32_e32 v9, vcc, 0, v15, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e64 v0, v14, v10, s[4:5]
 ; SDAG-NEXT:    v_ffbh_u32_e32 v10, v29
 ; SDAG-NEXT:    v_ffbh_u32_e32 v11, v28
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v12, v7, 0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v13, v6, v1, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v12, v5, 0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v13, v4, v1, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, v15, v9, s[4:5]
-; SDAG-NEXT:    v_or_b32_e32 v6, v29, v0
+; SDAG-NEXT:    v_or_b32_e32 v4, v29, v0
 ; SDAG-NEXT:    v_ffbh_u32_e32 v9, v0
 ; SDAG-NEXT:    v_add_i32_e32 v10, vcc, 32, v10
-; SDAG-NEXT:    v_or_b32_e32 v7, v28, v1
+; SDAG-NEXT:    v_or_b32_e32 v5, v28, v1
 ; SDAG-NEXT:    v_add_i32_e32 v9, vcc, 32, v9
 ; SDAG-NEXT:    v_ffbh_u32_e32 v14, v1
 ; SDAG-NEXT:    v_min_u32_e32 v10, v10, v11
-; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; SDAG-NEXT:    v_min_u32_e32 v6, v9, v14
-; SDAG-NEXT:    v_add_i32_e64 v7, s[4:5], 64, v10
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT:    v_min_u32_e32 v4, v9, v14
+; SDAG-NEXT:    v_add_i32_e64 v5, s[4:5], 64, v10
 ; SDAG-NEXT:    v_addc_u32_e64 v9, s[4:5], 0, 0, s[4:5]
 ; SDAG-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v9, v9, 0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
-; SDAG-NEXT:    v_sub_i32_e32 v6, vcc, v6, v13
-; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v9, v12, vcc
-; SDAG-NEXT:    v_xor_b32_e32 v9, 0x7f, v6
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; SDAG-NEXT:    v_sub_i32_e32 v4, vcc, v4, v13
+; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v9, v12, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v9, 0x7f, v4
 ; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, 0, v8, vcc
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[6:7]
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, 0, v8, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v8, v9, v10
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; SDAG-NEXT:    v_or_b32_e32 v9, v7, v11
+; SDAG-NEXT:    v_or_b32_e32 v9, v5, v11
 ; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
 ; SDAG-NEXT:    v_cndmask_b32_e32 v12, v13, v12, vcc
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
 ; SDAG-NEXT:    v_and_b32_e32 v8, 1, v12
 ; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v8
 ; SDAG-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v13, v5, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v13, v7, 0, s[4:5]
 ; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
-; SDAG-NEXT:    v_cndmask_b32_e64 v9, v4, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v9, v6, 0, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v14, v3, 0, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v8, v2, 0, s[4:5]
 ; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; SDAG-NEXT:    s_cbranch_execz .LBB0_12
 ; SDAG-NEXT:  ; %bb.7: ; %udiv-bb1
-; SDAG-NEXT:    v_add_i32_e32 v30, vcc, 1, v6
-; SDAG-NEXT:    v_sub_i32_e64 v12, s[4:5], 63, v6
+; SDAG-NEXT:    v_add_i32_e32 v30, vcc, 1, v4
+; SDAG-NEXT:    v_sub_i32_e64 v12, s[4:5], 63, v4
 ; SDAG-NEXT:    v_mov_b32_e32 v8, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v9, 0
-; SDAG-NEXT:    v_addc_u32_e32 v31, vcc, 0, v7, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v31, vcc, 0, v5, vcc
 ; SDAG-NEXT:    v_lshl_b64 v[12:13], v[2:3], v12
 ; SDAG-NEXT:    v_addc_u32_e32 v32, vcc, 0, v10, vcc
 ; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, 0, v11, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v10, v30, v32
-; SDAG-NEXT:    v_sub_i32_e32 v34, vcc, 0x7f, v6
+; SDAG-NEXT:    v_sub_i32_e32 v34, vcc, 0x7f, v4
 ; SDAG-NEXT:    v_or_b32_e32 v11, v31, v33
-; SDAG-NEXT:    v_lshl_b64 v[6:7], v[4:5], v34
+; SDAG-NEXT:    v_lshl_b64 v[4:5], v[6:7], v34
 ; SDAG-NEXT:    v_sub_i32_e32 v35, vcc, 64, v34
 ; SDAG-NEXT:    v_lshl_b64 v[14:15], v[2:3], v34
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
 ; SDAG-NEXT:    v_lshr_b64 v[10:11], v[2:3], v35
-; SDAG-NEXT:    v_or_b32_e32 v7, v7, v11
-; SDAG-NEXT:    v_or_b32_e32 v6, v6, v10
+; SDAG-NEXT:    v_or_b32_e32 v5, v5, v11
+; SDAG-NEXT:    v_or_b32_e32 v4, v4, v10
 ; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v34
-; SDAG-NEXT:    v_cndmask_b32_e64 v7, v13, v7, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v6, v12, v6, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v11, 0, v15, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v10, 0, v14, s[4:5]
 ; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v34
-; SDAG-NEXT:    v_cndmask_b32_e64 v7, v7, v5, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v6, v6, v4, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[4:5]
 ; SDAG-NEXT:    v_mov_b32_e32 v12, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v13, 0
 ; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -321,24 +321,24 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_lshr_b64 v[8:9], v[2:3], v30
 ; SDAG-NEXT:    v_sub_i32_e32 v35, vcc, 64, v30
 ; SDAG-NEXT:    v_subrev_i32_e32 v36, vcc, 64, v30
-; SDAG-NEXT:    v_lshr_b64 v[37:38], v[4:5], v30
+; SDAG-NEXT:    v_lshr_b64 v[37:38], v[6:7], v30
 ; SDAG-NEXT:    v_add_i32_e32 v34, vcc, -1, v29
 ; SDAG-NEXT:    v_mov_b32_e32 v12, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v13, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v14, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v15, 0
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0
-; SDAG-NEXT:    v_lshl_b64 v[48:49], v[4:5], v35
-; SDAG-NEXT:    v_lshr_b64 v[4:5], v[4:5], v36
+; SDAG-NEXT:    v_lshl_b64 v[48:49], v[6:7], v35
+; SDAG-NEXT:    v_lshr_b64 v[6:7], v[6:7], v36
 ; SDAG-NEXT:    v_addc_u32_e32 v35, vcc, -1, v28, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v9, v9, v49
 ; SDAG-NEXT:    v_or_b32_e32 v8, v8, v48
 ; SDAG-NEXT:    v_addc_u32_e32 v36, vcc, -1, v0, vcc
 ; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v30
-; SDAG-NEXT:    v_cndmask_b32_e64 v9, v5, v9, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v8, v4, v8, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v5, 0, v38, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, v37, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v9, v7, v9, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v6, v8, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v7, 0, v38, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, v37, s[4:5]
 ; SDAG-NEXT:    v_addc_u32_e32 v37, vcc, -1, v1, vcc
 ; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v30
 ; SDAG-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
@@ -346,23 +346,23 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_mov_b32_e32 v9, 0
 ; SDAG-NEXT:  .LBB0_9: ; %udiv-do-while
 ; SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SDAG-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
+; SDAG-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v8, 31, v3
 ; SDAG-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; SDAG-NEXT:    v_lshrrev_b32_e32 v38, 31, v7
-; SDAG-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v38, 31, v5
+; SDAG-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v39, 31, v11
 ; SDAG-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
-; SDAG-NEXT:    v_or_b32_e32 v4, v4, v8
+; SDAG-NEXT:    v_or_b32_e32 v6, v6, v8
 ; SDAG-NEXT:    v_or_b32_e32 v2, v2, v38
-; SDAG-NEXT:    v_or_b32_e32 v6, v6, v39
-; SDAG-NEXT:    v_or_b32_e32 v7, v13, v7
+; SDAG-NEXT:    v_or_b32_e32 v4, v4, v39
+; SDAG-NEXT:    v_or_b32_e32 v5, v13, v5
 ; SDAG-NEXT:    v_or_b32_e32 v11, v15, v11
 ; SDAG-NEXT:    v_sub_i32_e32 v8, vcc, v34, v2
-; SDAG-NEXT:    v_or_b32_e32 v6, v12, v6
+; SDAG-NEXT:    v_or_b32_e32 v4, v12, v4
 ; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, v35, v3, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, v36, v4, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, v37, v5, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, v36, v6, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, v37, v7, vcc
 ; SDAG-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
 ; SDAG-NEXT:    v_and_b32_e32 v15, v8, v29
 ; SDAG-NEXT:    v_and_b32_e32 v38, v8, v28
@@ -370,8 +370,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_and_b32_e32 v48, v8, v1
 ; SDAG-NEXT:    v_sub_i32_e32 v2, vcc, v2, v15
 ; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v3, v38, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v4, vcc, v4, v39, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v5, v48, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v6, vcc, v6, v39, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v7, v48, vcc
 ; SDAG-NEXT:    v_add_i32_e32 v30, vcc, -1, v30
 ; SDAG-NEXT:    v_addc_u32_e32 v31, vcc, -1, v31, vcc
 ; SDAG-NEXT:    v_addc_u32_e32 v32, vcc, -1, v32, vcc
@@ -390,7 +390,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; SDAG-NEXT:  .LBB0_11: ; %Flow11
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT:    v_lshl_b64 v[0:1], v[6:7], 1
+; SDAG-NEXT:    v_lshl_b64 v[0:1], v[4:5], 1
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v4, 31, v11
 ; SDAG-NEXT:    v_lshl_b64 v[2:3], v[10:11], 1
 ; SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
@@ -402,11 +402,11 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; SDAG-NEXT:    v_xor_b32_e32 v3, v27, v26
 ; SDAG-NEXT:    v_xor_b32_e32 v2, v25, v24
-; SDAG-NEXT:    v_xor_b32_e32 v7, v19, v18
+; SDAG-NEXT:    v_xor_b32_e32 v7, v21, v20
 ; SDAG-NEXT:    v_xor_b32_e32 v6, v17, v16
-; SDAG-NEXT:    v_xor_b32_e32 v4, v20, v3
+; SDAG-NEXT:    v_xor_b32_e32 v4, v18, v3
 ; SDAG-NEXT:    v_xor_b32_e32 v5, v22, v2
-; SDAG-NEXT:    v_xor_b32_e32 v1, v21, v3
+; SDAG-NEXT:    v_xor_b32_e32 v1, v19, v3
 ; SDAG-NEXT:    v_xor_b32_e32 v0, v23, v2
 ; SDAG-NEXT:    v_xor_b32_e32 v10, v13, v7
 ; SDAG-NEXT:    v_xor_b32_e32 v9, v9, v6

diff  --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
index 279c34722e272..827cb4ac2589a 100644
--- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
@@ -30,8 +30,8 @@ define amdgpu_ps void @main(i32 %0, float %1) {
 ; ISA-NEXT:  .LBB0_1: ; %Flow1
 ; ISA-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; ISA-NEXT:    s_or_b64 exec, exec, s[4:5]
-; ISA-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; ISA-NEXT:    s_mov_b64 s[8:9], 0
+; ISA-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; ISA-NEXT:  .LBB0_2: ; %Flow
 ; ISA-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; ISA-NEXT:    s_and_b64 s[6:7], exec, s[4:5]

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index e3bd4e7383598..883063b5471ca 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -5407,50 +5407,54 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
 ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX90A-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    v_mov_b32_e32 v4, v0
+; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execz .LBB24_4
-; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB24_3
+; GFX90A-NEXT:  ; %bb.1: ; %Flow2
+; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB24_6
+; GFX90A-NEXT:  .LBB24_2: ; %atomicrmw.phi
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+; GFX90A-NEXT:  .LBB24_3: ; %atomicrmw.global
+; GFX90A-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
-; GFX90A-NEXT:  .LBB24_2: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB24_4: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT:    v_max_f64 v[2:3], v[8:9], v[8:9]
-; GFX90A-NEXT:    v_max_f64 v[6:7], v[2:3], v[4:5]
-; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX90A-NEXT:    v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB24_2
-; GFX90A-NEXT:  ; %bb.3: ; %Flow
+; GFX90A-NEXT:    s_cbranch_execnz .LBB24_4
+; GFX90A-NEXT:  ; %bb.5: ; %Flow
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX90A-NEXT:  .LBB24_4: ; %Flow2
+; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX90A-NEXT:    s_cbranch_execz .LBB24_6
-; GFX90A-NEXT:  ; %bb.5: ; %atomicrmw.private
-; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc
-; GFX90A-NEXT:    buffer_load_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    s_cbranch_execz .LBB24_2
+; GFX90A-NEXT:  .LBB24_6: ; %atomicrmw.private
+; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
+; GFX90A-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX90A-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX90A-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
-; GFX90A-NEXT:  .LBB24_6: ; %atomicrmw.phi
+; GFX90A-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX90A-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX90A-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, v2
-; GFX90A-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index 25d59a26189c9..c603421ca15b4 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -5407,50 +5407,54 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
 ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX90A-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    v_mov_b32_e32 v4, v0
+; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execz .LBB24_4
-; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB24_3
+; GFX90A-NEXT:  ; %bb.1: ; %Flow2
+; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB24_6
+; GFX90A-NEXT:  .LBB24_2: ; %atomicrmw.phi
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+; GFX90A-NEXT:  .LBB24_3: ; %atomicrmw.global
+; GFX90A-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
-; GFX90A-NEXT:  .LBB24_2: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB24_4: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT:    v_max_f64 v[2:3], v[8:9], v[8:9]
-; GFX90A-NEXT:    v_min_f64 v[6:7], v[2:3], v[4:5]
-; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX90A-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB24_2
-; GFX90A-NEXT:  ; %bb.3: ; %Flow
+; GFX90A-NEXT:    s_cbranch_execnz .LBB24_4
+; GFX90A-NEXT:  ; %bb.5: ; %Flow
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX90A-NEXT:  .LBB24_4: ; %Flow2
+; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX90A-NEXT:    s_cbranch_execz .LBB24_6
-; GFX90A-NEXT:  ; %bb.5: ; %atomicrmw.private
-; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc
-; GFX90A-NEXT:    buffer_load_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    s_cbranch_execz .LBB24_2
+; GFX90A-NEXT:  .LBB24_6: ; %atomicrmw.private
+; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
+; GFX90A-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX90A-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX90A-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
-; GFX90A-NEXT:  .LBB24_6: ; %atomicrmw.phi
+; GFX90A-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX90A-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
+; GFX90A-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, v2
-; GFX90A-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index afa57b8692aa5..100a560c1d127 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -711,20 +711,20 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB3_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB3_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -738,21 +738,21 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB3_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_add_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB3_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -892,20 +892,20 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB4_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB4_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -919,21 +919,21 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB4_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_add_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB4_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1074,7 +1074,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g
 ; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
@@ -1083,15 +1083,15 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g
 ; GFX7-NEXT:  .LBB5_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB5_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1105,7 +1105,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g
 ; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
@@ -1114,16 +1114,16 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g
 ; GFX6-NEXT:  .LBB5_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_add_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB5_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1457,20 +1457,20 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB7_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1484,21 +1484,21 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB7_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_add_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2059,20 +2059,20 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2086,21 +2086,21 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_add_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2818,20 +2818,20 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2845,21 +2845,21 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_add_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3000,20 +3000,20 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB15_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3027,21 +3027,21 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB15_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_add_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3156,20 +3156,20 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory_
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3183,21 +3183,21 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory_
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_add_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3352,20 +3352,20 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3379,21 +3379,21 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_add_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3747,20 +3747,20 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3774,21 +3774,21 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_add_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4141,20 +4141,20 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4168,21 +4168,21 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_add_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4505,20 +4505,20 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB23_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4532,21 +4532,21 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_add_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB23_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5189,20 +5189,20 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5216,21 +5216,21 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_add_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5344,20 +5344,20 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_f
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5371,21 +5371,21 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_f
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_add_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5500,7 +5500,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f
 ; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
@@ -5509,15 +5509,15 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f
 ; GFX7-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5531,7 +5531,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f
 ; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
@@ -5540,16 +5540,16 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f
 ; GFX6-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_add_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5843,20 +5843,20 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5870,21 +5870,21 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_add_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6172,20 +6172,20 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6199,21 +6199,21 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_add_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6567,20 +6567,20 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6594,21 +6594,21 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_add_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6891,20 +6891,20 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6918,21 +6918,21 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_add_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7769,23 +7769,23 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX7-NEXT:    v_mov_b32_e32 v11, v7
+; GFX7-NEXT:    v_mov_b32_e32 v10, v6
 ; GFX7-NEXT:    v_mov_b32_e32 v9, v5
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v4
-; GFX7-NEXT:    v_add_f64 v[6:7], v[8:9], v[2:3]
-; GFX7-NEXT:    v_mov_b32_e32 v4, v6
-; GFX7-NEXT:    v_mov_b32_e32 v5, v7
-; GFX7-NEXT:    v_mov_b32_e32 v6, v8
-; GFX7-NEXT:    v_mov_b32_e32 v7, v9
-; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v6, v8
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v7, v9
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7799,24 +7799,24 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v11, v7
+; GFX6-NEXT:    v_mov_b32_e32 v10, v6
 ; GFX6-NEXT:    v_mov_b32_e32 v9, v5
 ; GFX6-NEXT:    v_mov_b32_e32 v8, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f64 v[6:7], v[8:9], v[2:3]
-; GFX6-NEXT:    v_mov_b32_e32 v4, v6
-; GFX6-NEXT:    v_mov_b32_e32 v5, v7
-; GFX6-NEXT:    v_mov_b32_e32 v6, v8
-; GFX6-NEXT:    v_mov_b32_e32 v7, v9
-; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v6, v8
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v7, v9
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7975,23 +7975,23 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:2040
+; GFX7-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX7-NEXT:    v_mov_b32_e32 v11, v7
+; GFX7-NEXT:    v_mov_b32_e32 v10, v6
 ; GFX7-NEXT:    v_mov_b32_e32 v9, v5
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v4
-; GFX7-NEXT:    v_add_f64 v[6:7], v[8:9], v[2:3]
-; GFX7-NEXT:    v_mov_b32_e32 v4, v6
-; GFX7-NEXT:    v_mov_b32_e32 v5, v7
-; GFX7-NEXT:    v_mov_b32_e32 v6, v8
-; GFX7-NEXT:    v_mov_b32_e32 v7, v9
-; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v6, v8
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v7, v9
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8005,24 +8005,24 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:2040
+; GFX6-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v11, v7
+; GFX6-NEXT:    v_mov_b32_e32 v10, v6
 ; GFX6-NEXT:    v_mov_b32_e32 v9, v5
 ; GFX6-NEXT:    v_mov_b32_e32 v8, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f64 v[6:7], v[8:9], v[2:3]
-; GFX6-NEXT:    v_mov_b32_e32 v4, v6
-; GFX6-NEXT:    v_mov_b32_e32 v5, v7
-; GFX6-NEXT:    v_mov_b32_e32 v6, v8
-; GFX6-NEXT:    v_mov_b32_e32 v7, v9
-; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v6, v8
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v7, v9
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8182,7 +8182,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g
 ; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
@@ -8191,18 +8191,18 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g
 ; GFX7-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX7-NEXT:    v_mov_b32_e32 v11, v7
+; GFX7-NEXT:    v_mov_b32_e32 v10, v6
 ; GFX7-NEXT:    v_mov_b32_e32 v9, v5
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v4
-; GFX7-NEXT:    v_add_f64 v[6:7], v[8:9], v[2:3]
-; GFX7-NEXT:    v_mov_b32_e32 v4, v6
-; GFX7-NEXT:    v_mov_b32_e32 v5, v7
-; GFX7-NEXT:    v_mov_b32_e32 v6, v8
-; GFX7-NEXT:    v_mov_b32_e32 v7, v9
-; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v6, v8
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v7, v9
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8216,7 +8216,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g
 ; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
@@ -8225,19 +8225,19 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g
 ; GFX6-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v11, v7
+; GFX6-NEXT:    v_mov_b32_e32 v10, v6
 ; GFX6-NEXT:    v_mov_b32_e32 v9, v5
 ; GFX6-NEXT:    v_mov_b32_e32 v8, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f64 v[6:7], v[8:9], v[2:3]
-; GFX6-NEXT:    v_mov_b32_e32 v4, v6
-; GFX6-NEXT:    v_mov_b32_e32 v5, v7
-; GFX6-NEXT:    v_mov_b32_e32 v6, v8
-; GFX6-NEXT:    v_mov_b32_e32 v7, v9
-; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v6, v8
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v7, v9
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8597,36 +8597,36 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v5
+; GFX7-NEXT:    v_not_b32_e32 v7, v2
 ; GFX7-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX7-NEXT:    v_add_f32_e32 v4, v4, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v2, v4
-; GFX7-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v7
-; GFX7-NEXT:    v_mov_b32_e32 v5, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, v4, v7
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v2
+; GFX7-NEXT:    v_mov_b32_e32 v2, v3
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
+; GFX7-NEXT:    buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -8640,37 +8640,36 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, 3, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v5
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v5
+; GFX6-NEXT:    v_not_b32_e32 v7, v2
 ; GFX6-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX6-NEXT:    v_add_f32_e32 v4, v4, v3
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v2, v4
-; GFX6-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v7
-; GFX6-NEXT:    v_mov_b32_e32 v5, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, v4, v7
+; GFX6-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v5, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v2, v3
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
+; GFX6-NEXT:    buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v2, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -9033,36 +9032,36 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
+; GFX7-NEXT:    v_not_b32_e32 v8, v2
 ; GFX7-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v8
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -9077,37 +9076,37 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
+; GFX6-NEXT:    v_not_b32_e32 v8, v2
 ; GFX6-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, v3, v8
+; GFX6-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v2, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -9472,36 +9471,36 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
+; GFX7-NEXT:    v_not_b32_e32 v8, v2
 ; GFX7-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v8
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -9516,37 +9515,37 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
+; GFX6-NEXT:    v_not_b32_e32 v8, v2
 ; GFX6-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, v3, v8
+; GFX6-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v2, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -9889,28 +9888,28 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, 3, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_not_b32_e32 v6, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v5
 ; GFX7-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_and_b32_e32 v7, v4, v6
+; GFX7-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT:    v_or_b32_e32 v3, v7, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX7-NEXT:    v_add_f32_e32 v4, v4, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v2, v4
-; GFX7-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v7
-; GFX7-NEXT:    v_mov_b32_e32 v5, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v3
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -9930,29 +9929,29 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 3, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v3, 0xffff, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v5
 ; GFX6-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v4
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX6-NEXT:    v_add_f32_e32 v4, v4, v3
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v2, v4
-; GFX6-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v7
-; GFX6-NEXT:    v_mov_b32_e32 v5, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v4, v6
+; GFX6-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT:    v_or_b32_e32 v3, v7, v3
+; GFX6-NEXT:    v_mov_b32_e32 v8, v4
+; GFX6-NEXT:    v_mov_b32_e32 v7, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10305,31 +10304,31 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
+; GFX7-NEXT:    v_not_b32_e32 v6, v2
 ; GFX7-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, v3, v6
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10347,32 +10346,32 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
+; GFX6-NEXT:    v_not_b32_e32 v6, v2
 ; GFX6-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v3, v6
+; GFX6-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10727,31 +10726,31 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
+; GFX7-NEXT:    v_not_b32_e32 v6, v2
 ; GFX7-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, v3, v6
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10769,32 +10768,32 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
+; GFX6-NEXT:    v_not_b32_e32 v6, v2
 ; GFX6-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v3, v6
+; GFX6-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -11067,28 +11066,28 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v2
 ; GFX7-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -11101,29 +11100,29 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v2
 ; GFX6-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX6-NEXT:    v_add_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX6-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v4
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
@@ -11380,23 +11379,23 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v2
 ; GFX7-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v3
+; GFX7-NEXT:    v_mov_b32_e32 v5, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -11413,24 +11412,24 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v2
 ; GFX6-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX6-NEXT:    v_add_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX6-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX6-NEXT:    v_mov_b32_e32 v6, v3
+; GFX6-NEXT:    v_mov_b32_e32 v5, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -11801,36 +11800,36 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
+; GFX7-NEXT:    v_not_b32_e32 v8, v2
 ; GFX7-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v8
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -11845,37 +11844,37 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
+; GFX6-NEXT:    v_not_b32_e32 v8, v2
 ; GFX6-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, v3, v8
+; GFX6-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v2, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -12230,31 +12229,31 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
+; GFX7-NEXT:    v_not_b32_e32 v6, v2
 ; GFX7-NEXT:  .LBB53_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, v3, v6
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX7-NEXT:    v_mov_b32_e32 v8, v3
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -12272,32 +12271,32 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
+; GFX6-NEXT:    v_not_b32_e32 v6, v2
 ; GFX6-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v3, v6
+; GFX6-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -12732,35 +12731,35 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
+; GFX7-NEXT:    v_lshl_b32_e32 v3, 0xffff, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v5
+; GFX7-NEXT:    v_not_b32_e32 v6, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v3, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_add_f32_e32 v4, v4, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
-; GFX7-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v7
-; GFX7-NEXT:    v_mov_b32_e32 v5, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, v4, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v2
+; GFX7-NEXT:    v_mov_b32_e32 v2, v3
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
+; GFX7-NEXT:    buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -12775,36 +12774,35 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
-; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
+; GFX6-NEXT:    v_lshl_b32_e32 v3, 0xffff, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v5
+; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v3, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_add_f32_e32 v4, v4, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
-; GFX6-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v7
-; GFX6-NEXT:    v_mov_b32_e32 v5, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, v4, v6
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v5, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v2, v3
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
+; GFX6-NEXT:    buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -13248,35 +13246,35 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v6
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_not_b32_e32 v7, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -13292,36 +13290,36 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v6
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_not_b32_e32 v7, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_add_f32_e32 v2, v2, v8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_add_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, v3, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -13767,35 +13765,35 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v6
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_not_b32_e32 v7, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -13811,36 +13809,36 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v6
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_not_b32_e32 v7, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_add_f32_e32 v2, v2, v8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_add_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, v3, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -14255,30 +14253,30 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
+; GFX7-NEXT:    v_lshl_b32_e32 v3, 0xffff, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v5
+; GFX7-NEXT:    v_not_b32_e32 v6, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_add_f32_e32 v3, v3, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v7, v4, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT:    v_or_b32_e32 v3, v7, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v3, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_add_f32_e32 v4, v4, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
-; GFX7-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v7
-; GFX7-NEXT:    v_mov_b32_e32 v5, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v3
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -14296,31 +14294,31 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
-; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
+; GFX6-NEXT:    v_lshl_b32_e32 v3, 0xffff, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v5
+; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT:    v_add_f32_e32 v3, v3, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v3, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_add_f32_e32 v4, v4, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
-; GFX6-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v7
-; GFX6-NEXT:    v_mov_b32_e32 v5, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v4, v6
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT:    v_or_b32_e32 v3, v7, v3
+; GFX6-NEXT:    v_mov_b32_e32 v8, v4
+; GFX6-NEXT:    v_mov_b32_e32 v7, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -14754,30 +14752,30 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_not_b32_e32 v5, v5
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, v3, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -14796,31 +14794,31 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_not_b32_e32 v5, v5
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_add_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v3, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -15256,30 +15254,30 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_not_b32_e32 v5, v5
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, v3, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -15298,31 +15296,31 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_not_b32_e32 v5, v5
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_add_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v3, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -15680,28 +15678,28 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -15714,29 +15712,29 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX6-NEXT:    v_add_f32_e32 v2, v2, v6
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX6-NEXT:    v_add_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
@@ -16078,23 +16076,23 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v3
+; GFX7-NEXT:    v_mov_b32_e32 v5, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -16111,24 +16109,24 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX6-NEXT:    v_add_f32_e32 v2, v2, v4
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX6-NEXT:    v_add_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX6-NEXT:    v_mov_b32_e32 v6, v3
+; GFX6-NEXT:    v_mov_b32_e32 v5, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -16580,35 +16578,35 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v6
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_not_b32_e32 v7, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB62_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB62_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -16624,36 +16622,36 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v6
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_not_b32_e32 v7, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB62_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_add_f32_e32 v2, v2, v8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_add_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, v3, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB62_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -17089,30 +17087,30 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_not_b32_e32 v5, v5
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB63_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, v3, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB63_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -17131,31 +17129,31 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_not_b32_e32 v5, v5
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB63_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_add_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v3, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB63_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end

diff  --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
index b9774808f1ad1..faa3ee61427a2 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
@@ -4806,36 +4806,36 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v5
+; GFX7-NEXT:    v_not_b32_e32 v7, v2
 ; GFX7-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX7-NEXT:    v_max_f32_e32 v4, v4, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v2, v4
-; GFX7-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v7
-; GFX7-NEXT:    v_mov_b32_e32 v5, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, v4, v7
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v2
+; GFX7-NEXT:    v_mov_b32_e32 v2, v3
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
+; GFX7-NEXT:    buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4849,37 +4849,36 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, 3, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v5
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v5
+; GFX6-NEXT:    v_not_b32_e32 v7, v2
 ; GFX6-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX6-NEXT:    v_max_f32_e32 v4, v4, v3
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v2, v4
-; GFX6-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v7
-; GFX6-NEXT:    v_mov_b32_e32 v5, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, v4, v7
+; GFX6-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v5, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v2, v3
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
+; GFX6-NEXT:    buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v2, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -5265,36 +5264,36 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
+; GFX7-NEXT:    v_not_b32_e32 v8, v2
 ; GFX7-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v8
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5309,37 +5308,37 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
+; GFX6-NEXT:    v_not_b32_e32 v8, v2
 ; GFX6-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, v3, v8
+; GFX6-NEXT:    v_max_f32_e32 v2, v2, v7
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v2, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -5727,36 +5726,36 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
+; GFX7-NEXT:    v_not_b32_e32 v8, v2
 ; GFX7-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v8
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5771,37 +5770,37 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
+; GFX6-NEXT:    v_not_b32_e32 v8, v2
 ; GFX6-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, v3, v8
+; GFX6-NEXT:    v_max_f32_e32 v2, v2, v7
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v2, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -6166,28 +6165,28 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, 3, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_not_b32_e32 v6, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v5
 ; GFX7-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_and_b32_e32 v7, v4, v6
+; GFX7-NEXT:    v_max_f32_e32 v3, v3, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT:    v_or_b32_e32 v3, v7, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX7-NEXT:    v_max_f32_e32 v4, v4, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v2, v4
-; GFX7-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v7
-; GFX7-NEXT:    v_mov_b32_e32 v5, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v3
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6207,29 +6206,29 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 3, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v3, 0xffff, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v5
 ; GFX6-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v4
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX6-NEXT:    v_max_f32_e32 v4, v4, v3
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v2, v4
-; GFX6-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v7
-; GFX6-NEXT:    v_mov_b32_e32 v5, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v4, v6
+; GFX6-NEXT:    v_max_f32_e32 v3, v3, v5
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT:    v_or_b32_e32 v3, v7, v3
+; GFX6-NEXT:    v_mov_b32_e32 v8, v4
+; GFX6-NEXT:    v_mov_b32_e32 v7, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6609,31 +6608,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
+; GFX7-NEXT:    v_not_b32_e32 v6, v2
 ; GFX7-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, v3, v6
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6651,32 +6650,32 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
+; GFX6-NEXT:    v_not_b32_e32 v6, v2
 ; GFX6-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v3, v6
+; GFX6-NEXT:    v_max_f32_e32 v2, v2, v5
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7058,31 +7057,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
+; GFX7-NEXT:    v_not_b32_e32 v6, v2
 ; GFX7-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, v3, v6
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7100,32 +7099,32 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
+; GFX6-NEXT:    v_not_b32_e32 v6, v2
 ; GFX6-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v3, v6
+; GFX6-NEXT:    v_max_f32_e32 v2, v2, v5
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7416,28 +7415,28 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v2
 ; GFX7-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX7-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -7450,29 +7449,29 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v2
 ; GFX6-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX6-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX6-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v4
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
@@ -7751,23 +7750,23 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v2
 ; GFX7-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX7-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v3
+; GFX7-NEXT:    v_mov_b32_e32 v5, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7784,24 +7783,24 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v2
 ; GFX6-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX6-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX6-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX6-NEXT:    v_mov_b32_e32 v6, v3
+; GFX6-NEXT:    v_mov_b32_e32 v5, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8195,36 +8194,36 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
+; GFX7-NEXT:    v_not_b32_e32 v8, v2
 ; GFX7-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v8
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -8239,37 +8238,37 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
+; GFX6-NEXT:    v_not_b32_e32 v8, v2
 ; GFX6-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, v3, v8
+; GFX6-NEXT:    v_max_f32_e32 v2, v2, v7
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v2, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -8651,31 +8650,31 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
+; GFX7-NEXT:    v_not_b32_e32 v6, v2
 ; GFX7-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, v3, v6
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8693,32 +8692,32 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
+; GFX6-NEXT:    v_not_b32_e32 v6, v2
 ; GFX6-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v3, v6
+; GFX6-NEXT:    v_max_f32_e32 v2, v2, v5
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -9153,36 +9152,36 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
+; GFX7-NEXT:    v_lshl_b32_e32 v3, 0xffff, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v5
+; GFX7-NEXT:    v_not_b32_e32 v6, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v3, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT:    v_max_f32_e32 v4, v4, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
-; GFX7-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v7
-; GFX7-NEXT:    v_mov_b32_e32 v5, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v7
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, v4, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v2
+; GFX7-NEXT:    v_mov_b32_e32 v2, v3
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
+; GFX7-NEXT:    buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -9197,37 +9196,36 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
-; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
+; GFX6-NEXT:    v_lshl_b32_e32 v3, 0xffff, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v5
+; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v3, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX6-NEXT:    v_max_f32_e32 v4, v4, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
-; GFX6-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v7
-; GFX6-NEXT:    v_mov_b32_e32 v5, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:    v_max_f32_e32 v2, v2, v7
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, v4, v6
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v5, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v2, v3
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
+; GFX6-NEXT:    buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -9671,36 +9669,36 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v6
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_not_b32_e32 v7, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -9716,37 +9714,37 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v6
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_not_b32_e32 v7, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:    v_max_f32_e32 v2, v2, v8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, v3, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -10192,36 +10190,36 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v6
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_not_b32_e32 v7, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -10237,37 +10235,37 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v6
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_not_b32_e32 v7, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:    v_max_f32_e32 v2, v2, v8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, v3, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -10682,31 +10680,31 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
+; GFX7-NEXT:    v_lshl_b32_e32 v3, 0xffff, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v5
+; GFX7-NEXT:    v_not_b32_e32 v6, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    v_max_f32_e32 v3, v3, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v7, v4, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT:    v_or_b32_e32 v3, v7, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v3, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT:    v_max_f32_e32 v4, v4, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
-; GFX7-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v7
-; GFX7-NEXT:    v_mov_b32_e32 v5, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v3
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10724,32 +10722,32 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
-; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
+; GFX6-NEXT:    v_lshl_b32_e32 v3, 0xffff, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v5
+; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT:    v_max_f32_e32 v3, v3, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v3, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX6-NEXT:    v_max_f32_e32 v4, v4, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
-; GFX6-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v7
-; GFX6-NEXT:    v_mov_b32_e32 v5, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v4, v6
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT:    v_or_b32_e32 v3, v7, v3
+; GFX6-NEXT:    v_mov_b32_e32 v8, v4
+; GFX6-NEXT:    v_mov_b32_e32 v7, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -11183,31 +11181,31 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_not_b32_e32 v5, v5
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, v3, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -11226,32 +11224,32 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_not_b32_e32 v5, v5
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v3, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -11687,31 +11685,31 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_not_b32_e32 v5, v5
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, v3, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -11730,32 +11728,32 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_not_b32_e32 v5, v5
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v3, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -12113,29 +12111,29 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -12148,30 +12146,30 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:    v_max_f32_e32 v2, v2, v6
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
@@ -12513,24 +12511,24 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v3
+; GFX7-NEXT:    v_mov_b32_e32 v5, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -12547,25 +12545,25 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX6-NEXT:    v_mov_b32_e32 v6, v3
+; GFX6-NEXT:    v_mov_b32_e32 v5, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -13017,36 +13015,36 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v6
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_not_b32_e32 v7, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -13062,37 +13060,37 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v6
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_not_b32_e32 v7, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:    v_max_f32_e32 v2, v2, v8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, v3, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -13528,31 +13526,31 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_not_b32_e32 v5, v5
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, v3, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -13571,32 +13569,32 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_not_b32_e32 v5, v5
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v3, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end

diff  --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
index c30543642d314..cb66f85ff3ae2 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
@@ -4806,36 +4806,36 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v5
+; GFX7-NEXT:    v_not_b32_e32 v7, v2
 ; GFX7-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX7-NEXT:    v_min_f32_e32 v4, v4, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v2, v4
-; GFX7-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v7
-; GFX7-NEXT:    v_mov_b32_e32 v5, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, v4, v7
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v2
+; GFX7-NEXT:    v_mov_b32_e32 v2, v3
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
+; GFX7-NEXT:    buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4849,37 +4849,36 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, 3, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v5
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v5
+; GFX6-NEXT:    v_not_b32_e32 v7, v2
 ; GFX6-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX6-NEXT:    v_min_f32_e32 v4, v4, v3
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v2, v4
-; GFX6-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v7
-; GFX6-NEXT:    v_mov_b32_e32 v5, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, v4, v7
+; GFX6-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v5, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v2, v3
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
+; GFX6-NEXT:    buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v2, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -5265,36 +5264,36 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
+; GFX7-NEXT:    v_not_b32_e32 v8, v2
 ; GFX7-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v8
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5309,37 +5308,37 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
+; GFX6-NEXT:    v_not_b32_e32 v8, v2
 ; GFX6-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, v3, v8
+; GFX6-NEXT:    v_min_f32_e32 v2, v2, v7
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v2, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -5727,36 +5726,36 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
+; GFX7-NEXT:    v_not_b32_e32 v8, v2
 ; GFX7-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v8
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5771,37 +5770,37 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
+; GFX6-NEXT:    v_not_b32_e32 v8, v2
 ; GFX6-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, v3, v8
+; GFX6-NEXT:    v_min_f32_e32 v2, v2, v7
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v2, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -6166,28 +6165,28 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, 3, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_not_b32_e32 v6, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v5
 ; GFX7-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_and_b32_e32 v7, v4, v6
+; GFX7-NEXT:    v_min_f32_e32 v3, v3, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT:    v_or_b32_e32 v3, v7, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX7-NEXT:    v_min_f32_e32 v4, v4, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v2, v4
-; GFX7-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v7
-; GFX7-NEXT:    v_mov_b32_e32 v5, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v3
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6207,29 +6206,29 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 3, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v3, 0xffff, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v5
 ; GFX6-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v4
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX6-NEXT:    v_min_f32_e32 v4, v4, v3
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v2, v4
-; GFX6-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v7
-; GFX6-NEXT:    v_mov_b32_e32 v5, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v4, v6
+; GFX6-NEXT:    v_min_f32_e32 v3, v3, v5
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT:    v_or_b32_e32 v3, v7, v3
+; GFX6-NEXT:    v_mov_b32_e32 v8, v4
+; GFX6-NEXT:    v_mov_b32_e32 v7, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6609,31 +6608,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
+; GFX7-NEXT:    v_not_b32_e32 v6, v2
 ; GFX7-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, v3, v6
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6651,32 +6650,32 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
+; GFX6-NEXT:    v_not_b32_e32 v6, v2
 ; GFX6-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v3, v6
+; GFX6-NEXT:    v_min_f32_e32 v2, v2, v5
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7058,31 +7057,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
+; GFX7-NEXT:    v_not_b32_e32 v6, v2
 ; GFX7-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, v3, v6
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7100,32 +7099,32 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
+; GFX6-NEXT:    v_not_b32_e32 v6, v2
 ; GFX6-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v3, v6
+; GFX6-NEXT:    v_min_f32_e32 v2, v2, v5
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7416,28 +7415,28 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v2
 ; GFX7-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX7-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -7450,29 +7449,29 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v2
 ; GFX6-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX6-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX6-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v4
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
@@ -7751,23 +7750,23 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v2
 ; GFX7-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX7-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v3
+; GFX7-NEXT:    v_mov_b32_e32 v5, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7784,24 +7783,24 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v2
 ; GFX6-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX6-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX6-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX6-NEXT:    v_mov_b32_e32 v6, v3
+; GFX6-NEXT:    v_mov_b32_e32 v5, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8195,36 +8194,36 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
+; GFX7-NEXT:    v_not_b32_e32 v8, v2
 ; GFX7-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v8
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -8239,37 +8238,37 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
+; GFX6-NEXT:    v_not_b32_e32 v8, v2
 ; GFX6-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, v3, v8
+; GFX6-NEXT:    v_min_f32_e32 v2, v2, v7
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v2, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -8651,31 +8650,31 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
+; GFX7-NEXT:    v_not_b32_e32 v6, v2
 ; GFX7-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, v3, v6
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8693,32 +8692,32 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
+; GFX6-NEXT:    v_not_b32_e32 v6, v2
 ; GFX6-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v3, v6
+; GFX6-NEXT:    v_min_f32_e32 v2, v2, v5
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -9153,36 +9152,36 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
+; GFX7-NEXT:    v_lshl_b32_e32 v3, 0xffff, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v5
+; GFX7-NEXT:    v_not_b32_e32 v6, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v3, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT:    v_min_f32_e32 v4, v4, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
-; GFX7-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v7
-; GFX7-NEXT:    v_mov_b32_e32 v5, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v7
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, v4, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v2
+; GFX7-NEXT:    v_mov_b32_e32 v2, v3
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
+; GFX7-NEXT:    buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -9197,37 +9196,36 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
-; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
+; GFX6-NEXT:    v_lshl_b32_e32 v3, 0xffff, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v5
+; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v3, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX6-NEXT:    v_min_f32_e32 v4, v4, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
-; GFX6-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v7
-; GFX6-NEXT:    v_mov_b32_e32 v5, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:    v_min_f32_e32 v2, v2, v7
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, v4, v6
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v5, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v2, v3
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
+; GFX6-NEXT:    buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -9671,36 +9669,36 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v6
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_not_b32_e32 v7, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -9716,37 +9714,37 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v6
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_not_b32_e32 v7, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:    v_min_f32_e32 v2, v2, v8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, v3, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -10192,36 +10190,36 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v6
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_not_b32_e32 v7, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -10237,37 +10235,37 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v6
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_not_b32_e32 v7, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:    v_min_f32_e32 v2, v2, v8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, v3, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -10682,31 +10680,31 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
+; GFX7-NEXT:    v_lshl_b32_e32 v3, 0xffff, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v5
+; GFX7-NEXT:    v_not_b32_e32 v6, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    v_min_f32_e32 v3, v3, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v7, v4, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT:    v_or_b32_e32 v3, v7, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v3, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT:    v_min_f32_e32 v4, v4, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
-; GFX7-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v7
-; GFX7-NEXT:    v_mov_b32_e32 v5, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v3
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10724,32 +10722,32 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
-; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
+; GFX6-NEXT:    v_lshl_b32_e32 v3, 0xffff, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v5
+; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT:    v_min_f32_e32 v3, v3, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v3, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX6-NEXT:    v_min_f32_e32 v4, v4, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
-; GFX6-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v7
-; GFX6-NEXT:    v_mov_b32_e32 v5, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v4, v6
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT:    v_or_b32_e32 v3, v7, v3
+; GFX6-NEXT:    v_mov_b32_e32 v8, v4
+; GFX6-NEXT:    v_mov_b32_e32 v7, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -11183,31 +11181,31 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_not_b32_e32 v5, v5
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, v3, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -11226,32 +11224,32 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_not_b32_e32 v5, v5
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v3, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -11687,31 +11685,31 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_not_b32_e32 v5, v5
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, v3, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -11730,32 +11728,32 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_not_b32_e32 v5, v5
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v3, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -12113,29 +12111,29 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -12148,30 +12146,30 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:    v_min_f32_e32 v2, v2, v6
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
@@ -12513,24 +12511,24 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v3
+; GFX7-NEXT:    v_mov_b32_e32 v5, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -12547,25 +12545,25 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX6-NEXT:    v_mov_b32_e32 v6, v3
+; GFX6-NEXT:    v_mov_b32_e32 v5, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -13017,36 +13015,36 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v6
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_not_b32_e32 v7, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -13062,37 +13060,37 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v6
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_not_b32_e32 v7, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:    v_min_f32_e32 v2, v2, v8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, v3, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -13528,31 +13526,31 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_not_b32_e32 v5, v5
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, v3, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -13571,32 +13569,32 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_not_b32_e32 v5, v5
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v3, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end

diff  --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
index 5e4a5c649bb24..f869b5778bfb2 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
@@ -894,20 +894,20 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB3_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_sub_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_sub_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB3_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -921,21 +921,21 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB3_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_sub_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_sub_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB3_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1117,20 +1117,20 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB4_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_sub_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_sub_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB4_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1144,21 +1144,21 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB4_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_sub_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_sub_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB4_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1341,7 +1341,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
@@ -1350,15 +1350,15 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:  .LBB5_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_sub_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_sub_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB5_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1372,7 +1372,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
@@ -1381,16 +1381,16 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:  .LBB5_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_sub_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_sub_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB5_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1813,20 +1813,20 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB7_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_sub_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_sub_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1840,21 +1840,21 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB7_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_sub_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_sub_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2749,20 +2749,20 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_sub_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_sub_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2776,21 +2776,21 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_sub_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_sub_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2972,20 +2972,20 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB12_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_sub_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_sub_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2999,21 +2999,21 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB12_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_sub_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_sub_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3196,7 +3196,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
@@ -3205,15 +3205,15 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX7-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_sub_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_sub_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3227,7 +3227,7 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
@@ -3236,16 +3236,16 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX6-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_sub_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_sub_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3668,20 +3668,20 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB15_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_sub_f32_e32 v3, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_sub_f32_e32 v4, v5, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3695,21 +3695,21 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB15_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_sub_f32_e32 v3, v4, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_sub_f32_e32 v4, v5, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    v_mov_b32_e32 v6, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4663,23 +4663,23 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
+; GFX7-NEXT:    v_mov_b32_e32 v11, v7
+; GFX7-NEXT:    v_mov_b32_e32 v10, v6
 ; GFX7-NEXT:    v_mov_b32_e32 v9, v5
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v4
-; GFX7-NEXT:    v_add_f64 v[6:7], v[8:9], -v[2:3]
-; GFX7-NEXT:    v_mov_b32_e32 v4, v6
-; GFX7-NEXT:    v_mov_b32_e32 v5, v7
-; GFX7-NEXT:    v_mov_b32_e32 v6, v8
-; GFX7-NEXT:    v_mov_b32_e32 v7, v9
-; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v6, v8
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v7, v9
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4693,24 +4693,24 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v11, v7
+; GFX6-NEXT:    v_mov_b32_e32 v10, v6
 ; GFX6-NEXT:    v_mov_b32_e32 v9, v5
 ; GFX6-NEXT:    v_mov_b32_e32 v8, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f64 v[6:7], v[8:9], -v[2:3]
-; GFX6-NEXT:    v_mov_b32_e32 v4, v6
-; GFX6-NEXT:    v_mov_b32_e32 v5, v7
-; GFX6-NEXT:    v_mov_b32_e32 v6, v8
-; GFX6-NEXT:    v_mov_b32_e32 v7, v9
-; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v6, v8
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v7, v9
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4895,23 +4895,23 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:2040
+; GFX7-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
+; GFX7-NEXT:    v_mov_b32_e32 v11, v7
+; GFX7-NEXT:    v_mov_b32_e32 v10, v6
 ; GFX7-NEXT:    v_mov_b32_e32 v9, v5
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v4
-; GFX7-NEXT:    v_add_f64 v[6:7], v[8:9], -v[2:3]
-; GFX7-NEXT:    v_mov_b32_e32 v4, v6
-; GFX7-NEXT:    v_mov_b32_e32 v5, v7
-; GFX7-NEXT:    v_mov_b32_e32 v6, v8
-; GFX7-NEXT:    v_mov_b32_e32 v7, v9
-; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v6, v8
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v7, v9
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4925,24 +4925,24 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:2040
+; GFX6-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v11, v7
+; GFX6-NEXT:    v_mov_b32_e32 v10, v6
 ; GFX6-NEXT:    v_mov_b32_e32 v9, v5
 ; GFX6-NEXT:    v_mov_b32_e32 v8, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f64 v[6:7], v[8:9], -v[2:3]
-; GFX6-NEXT:    v_mov_b32_e32 v4, v6
-; GFX6-NEXT:    v_mov_b32_e32 v5, v7
-; GFX6-NEXT:    v_mov_b32_e32 v6, v8
-; GFX6-NEXT:    v_mov_b32_e32 v7, v9
-; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v6, v8
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v7, v9
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5128,7 +5128,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
@@ -5137,18 +5137,18 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
+; GFX7-NEXT:    v_mov_b32_e32 v11, v7
+; GFX7-NEXT:    v_mov_b32_e32 v10, v6
 ; GFX7-NEXT:    v_mov_b32_e32 v9, v5
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v4
-; GFX7-NEXT:    v_add_f64 v[6:7], v[8:9], -v[2:3]
-; GFX7-NEXT:    v_mov_b32_e32 v4, v6
-; GFX7-NEXT:    v_mov_b32_e32 v5, v7
-; GFX7-NEXT:    v_mov_b32_e32 v6, v8
-; GFX7-NEXT:    v_mov_b32_e32 v7, v9
-; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v6, v8
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v7, v9
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5162,7 +5162,7 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
@@ -5171,19 +5171,19 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_add_f64 v[4:5], v[6:7], -v[2:3]
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v11, v7
+; GFX6-NEXT:    v_mov_b32_e32 v10, v6
 ; GFX6-NEXT:    v_mov_b32_e32 v9, v5
 ; GFX6-NEXT:    v_mov_b32_e32 v8, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_add_f64 v[6:7], v[8:9], -v[2:3]
-; GFX6-NEXT:    v_mov_b32_e32 v4, v6
-; GFX6-NEXT:    v_mov_b32_e32 v5, v7
-; GFX6-NEXT:    v_mov_b32_e32 v6, v8
-; GFX6-NEXT:    v_mov_b32_e32 v7, v9
-; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v6, v8
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v7, v9
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5543,36 +5543,36 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v5
+; GFX7-NEXT:    v_not_b32_e32 v7, v2
 ; GFX7-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX7-NEXT:    v_sub_f32_e32 v4, v4, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v2, v4
-; GFX7-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v7
-; GFX7-NEXT:    v_mov_b32_e32 v5, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, v4, v7
+; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v2
+; GFX7-NEXT:    v_mov_b32_e32 v2, v3
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
+; GFX7-NEXT:    buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5586,37 +5586,36 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, 3, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v5
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v5
+; GFX6-NEXT:    v_not_b32_e32 v7, v2
 ; GFX6-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX6-NEXT:    v_sub_f32_e32 v4, v4, v3
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v2, v4
-; GFX6-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v7
-; GFX6-NEXT:    v_mov_b32_e32 v5, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, v4, v7
+; GFX6-NEXT:    v_sub_f32_e32 v2, v2, v6
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v5, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v2, v3
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
+; GFX6-NEXT:    buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v2, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -5979,36 +5978,36 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
+; GFX7-NEXT:    v_not_b32_e32 v8, v2
 ; GFX7-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v8
+; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB23_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6023,37 +6022,37 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
+; GFX6-NEXT:    v_not_b32_e32 v8, v2
 ; GFX6-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_sub_f32_e32 v3, v3, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, v3, v8
+; GFX6-NEXT:    v_sub_f32_e32 v2, v2, v7
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB23_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v2, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -6418,36 +6417,36 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
+; GFX7-NEXT:    v_not_b32_e32 v8, v2
 ; GFX7-NEXT:  .LBB24_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v8
+; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB24_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6462,37 +6461,37 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
+; GFX6-NEXT:    v_not_b32_e32 v8, v2
 ; GFX6-NEXT:  .LBB24_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_sub_f32_e32 v3, v3, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, v3, v8
+; GFX6-NEXT:    v_sub_f32_e32 v2, v2, v7
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB24_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v2, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -6835,28 +6834,28 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, 3, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_not_b32_e32 v6, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v5
 ; GFX7-NEXT:  .LBB25_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_and_b32_e32 v7, v4, v6
+; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT:    v_or_b32_e32 v3, v7, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX7-NEXT:    v_sub_f32_e32 v4, v4, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v2, v4
-; GFX7-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v7
-; GFX7-NEXT:    v_mov_b32_e32 v5, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v3
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB25_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6876,29 +6875,29 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 3, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v3, 0xffff, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v5
 ; GFX6-NEXT:  .LBB25_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v4
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX6-NEXT:    v_sub_f32_e32 v4, v4, v3
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v2, v4
-; GFX6-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v7
-; GFX6-NEXT:    v_mov_b32_e32 v5, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v4, v6
+; GFX6-NEXT:    v_sub_f32_e32 v3, v3, v5
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT:    v_or_b32_e32 v3, v7, v3
+; GFX6-NEXT:    v_mov_b32_e32 v8, v4
+; GFX6-NEXT:    v_mov_b32_e32 v7, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB25_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7251,31 +7250,31 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
+; GFX7-NEXT:    v_not_b32_e32 v6, v2
 ; GFX7-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, v3, v6
+; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7293,32 +7292,32 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
+; GFX6-NEXT:    v_not_b32_e32 v6, v2
 ; GFX6-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_sub_f32_e32 v3, v3, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v3, v6
+; GFX6-NEXT:    v_sub_f32_e32 v2, v2, v5
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7673,31 +7672,31 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
+; GFX7-NEXT:    v_not_b32_e32 v6, v2
 ; GFX7-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, v3, v6
+; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7715,32 +7714,32 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
+; GFX6-NEXT:    v_not_b32_e32 v6, v2
 ; GFX6-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_sub_f32_e32 v3, v3, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v3, v6
+; GFX6-NEXT:    v_sub_f32_e32 v2, v2, v5
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8013,28 +8012,28 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v2
 ; GFX7-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
@@ -8047,29 +8046,29 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v2
 ; GFX6-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX6-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX6-NEXT:    v_sub_f32_e32 v2, v2, v6
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v4
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
@@ -8326,23 +8325,23 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v2
 ; GFX7-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v3
+; GFX7-NEXT:    v_mov_b32_e32 v5, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8359,24 +8358,24 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v2
 ; GFX6-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX6-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX6-NEXT:    v_sub_f32_e32 v2, v2, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX6-NEXT:    v_mov_b32_e32 v6, v3
+; GFX6-NEXT:    v_mov_b32_e32 v5, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8747,36 +8746,36 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
+; GFX7-NEXT:    v_not_b32_e32 v8, v2
 ; GFX7-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v8
+; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -8791,37 +8790,37 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
+; GFX6-NEXT:    v_not_b32_e32 v8, v2
 ; GFX6-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_sub_f32_e32 v3, v3, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, v3, v8
+; GFX6-NEXT:    v_sub_f32_e32 v2, v2, v7
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v2, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -9176,31 +9175,31 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
+; GFX7-NEXT:    v_not_b32_e32 v6, v2
 ; GFX7-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, v3, v6
+; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -9218,32 +9217,32 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
+; GFX6-NEXT:    v_not_b32_e32 v6, v2
 ; GFX6-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v8
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_sub_f32_e32 v3, v3, v5
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v3, v6
+; GFX6-NEXT:    v_sub_f32_e32 v2, v2, v5
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -9678,35 +9677,35 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
+; GFX7-NEXT:    v_lshl_b32_e32 v3, 0xffff, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v5
+; GFX7-NEXT:    v_not_b32_e32 v6, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v3, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_sub_f32_e32 v4, v4, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
-; GFX7-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v7
-; GFX7-NEXT:    v_mov_b32_e32 v5, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v7
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, v4, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v2
+; GFX7-NEXT:    v_mov_b32_e32 v2, v3
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
+; GFX7-NEXT:    buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -9721,36 +9720,35 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
-; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
+; GFX6-NEXT:    v_lshl_b32_e32 v3, 0xffff, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v5
+; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v3, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_sub_f32_e32 v4, v4, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
-; GFX6-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v7
-; GFX6-NEXT:    v_mov_b32_e32 v5, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_sub_f32_e32 v2, v2, v7
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_and_b32_e32 v3, v4, v6
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v5, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v2, v3
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
+; GFX6-NEXT:    buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -10194,35 +10192,35 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v6
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_not_b32_e32 v7, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -10238,36 +10236,36 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v6
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_not_b32_e32 v7, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_sub_f32_e32 v2, v2, v8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, v3, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -10713,35 +10711,35 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v6
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_not_b32_e32 v7, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -10757,36 +10755,36 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v6
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_not_b32_e32 v7, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_sub_f32_e32 v2, v2, v8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, v3, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -11201,30 +11199,30 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
+; GFX7-NEXT:    v_lshl_b32_e32 v3, 0xffff, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v5
+; GFX7-NEXT:    v_not_b32_e32 v6, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v7, v4, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT:    v_or_b32_e32 v3, v7, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v3, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_sub_f32_e32 v4, v4, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
-; GFX7-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v7
-; GFX7-NEXT:    v_mov_b32_e32 v5, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v3
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v4
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -11242,31 +11240,31 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
-; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
+; GFX6-NEXT:    v_lshl_b32_e32 v3, 0xffff, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v5
+; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT:    v_sub_f32_e32 v3, v3, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v3, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_sub_f32_e32 v4, v4, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_and_b32_e32 v5, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
-; GFX6-NEXT:    v_or_b32_e32 v7, v5, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v7
-; GFX6-NEXT:    v_mov_b32_e32 v5, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v4, v6
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT:    v_or_b32_e32 v3, v7, v3
+; GFX6-NEXT:    v_mov_b32_e32 v8, v4
+; GFX6-NEXT:    v_mov_b32_e32 v7, v3
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v4
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -11700,30 +11698,30 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_not_b32_e32 v5, v5
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, v3, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -11742,31 +11740,31 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_not_b32_e32 v5, v5
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_sub_f32_e32 v2, v2, v6
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v3, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -12202,30 +12200,30 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_not_b32_e32 v5, v5
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, v3, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -12244,31 +12242,31 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_not_b32_e32 v5, v5
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_sub_f32_e32 v2, v2, v6
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v3, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -12626,28 +12624,28 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
@@ -12660,29 +12658,29 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX6-NEXT:    v_sub_f32_e32 v2, v2, v6
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX6-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
@@ -13024,23 +13022,23 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v4
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_mov_b32_e32 v6, v3
+; GFX7-NEXT:    v_mov_b32_e32 v5, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -13057,24 +13055,24 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX6-NEXT:    v_sub_f32_e32 v2, v2, v4
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX6-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_or_b32_e32 v4, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX6-NEXT:    v_mov_b32_e32 v6, v3
+; GFX6-NEXT:    v_mov_b32_e32 v5, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -13526,35 +13524,35 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v6
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_not_b32_e32 v7, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, v3, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -13570,36 +13568,36 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v6
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_not_b32_e32 v7, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_sub_f32_e32 v2, v2, v8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v4, v3, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v4, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -14035,30 +14033,30 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_not_b32_e32 v6, v4
+; GFX7-NEXT:    v_not_b32_e32 v5, v5
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX7-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v7, v3, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX7-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    v_mov_b32_e32 v7, v2
+; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -14077,31 +14075,31 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 3, v4
-; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_not_b32_e32 v6, v4
+; GFX6-NEXT:    v_not_b32_e32 v5, v5
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX6-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_sub_f32_e32 v2, v2, v6
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_sub_f32_e32 v3, v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, v8, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT:    v_or_b32_e32 v7, v4, v3
-; GFX6-NEXT:    v_mov_b32_e32 v3, v7
-; GFX6-NEXT:    v_mov_b32_e32 v4, v8
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    v_and_b32_e32 v7, v3, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT:    v_mov_b32_e32 v8, v3
+; GFX6-NEXT:    v_mov_b32_e32 v7, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v8
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
index b9a37c4b98a80..f7882e6f12022 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
@@ -2167,22 +2167,22 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    v_and_b32_e32 v3, v4, v2
+; SI-NEXT:    v_not_b32_e32 v3, v3
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, v5, v2
-; SI-NEXT:    v_not_b32_e32 v4, v3
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    v_mov_b32_e32 v4, v5
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    v_mov_b32_e32 v6, v4
+; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v4, v5
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB51_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2245,22 +2245,22 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    v_and_b32_e32 v3, v4, v2
+; SI-NEXT:    v_not_b32_e32 v3, v3
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, v5, v2
-; SI-NEXT:    v_not_b32_e32 v4, v3
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    v_mov_b32_e32 v4, v5
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v6, v4
+; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v4, v5
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB52_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2487,38 +2487,38 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v3, s6, 0
-; SI-NEXT:    v_writelane_b32 v3, s7, 1
+; SI-NEXT:    v_writelane_b32 v4, s6, 0
+; SI-NEXT:    v_writelane_b32 v4, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_and_b32_e32 v0, s34, v1
+; SI-NEXT:    v_not_b32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v0, s34, v2
-; SI-NEXT:    v_not_b32_e32 v1, v0
-; SI-NEXT:    v_mov_b32_e32 v0, v1
-; SI-NEXT:    v_mov_b32_e32 v1, v2
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB55_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v3, 1
-; SI-NEXT:    v_readlane_b32 s6, v3, 0
+; SI-NEXT:    v_readlane_b32 s7, v4, 1
+; SI-NEXT:    v_readlane_b32 s6, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -2578,38 +2578,38 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v3, s6, 0
-; SI-NEXT:    v_writelane_b32 v3, s7, 1
+; SI-NEXT:    v_writelane_b32 v4, s6, 0
+; SI-NEXT:    v_writelane_b32 v4, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_and_b32_e32 v0, s34, v1
+; SI-NEXT:    v_not_b32_e32 v0, v0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v0, s34, v2
-; SI-NEXT:    v_not_b32_e32 v1, v0
-; SI-NEXT:    v_mov_b32_e32 v0, v1
-; SI-NEXT:    v_mov_b32_e32 v1, v2
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB56_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v3, 1
-; SI-NEXT:    v_readlane_b32 s6, v3, 0
+; SI-NEXT:    v_readlane_b32 s7, v4, 1
+; SI-NEXT:    v_readlane_b32 s6, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -2862,22 +2862,22 @@ define void @global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr ad
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    v_and_b32_e32 v3, v4, v2
+; SI-NEXT:    v_not_b32_e32 v3, v3
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, v5, v2
-; SI-NEXT:    v_not_b32_e32 v4, v3
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    v_mov_b32_e32 v4, v5
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v6, v4
+; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v4, v5
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB59_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3950,21 +3950,21 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB83_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    v_max_i32_e32 v3, v4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_max_i32_e32 v4, v5, v2
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    v_mov_b32_e32 v4, v5
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    v_mov_b32_e32 v6, v4
+; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v4, v5
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB83_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4025,21 +4025,21 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB84_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    v_max_i32_e32 v3, v4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_max_i32_e32 v4, v5, v2
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    v_mov_b32_e32 v4, v5
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v6, v4
+; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v4, v5
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB84_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4258,37 +4258,37 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v3, s6, 0
-; SI-NEXT:    v_writelane_b32 v3, s7, 1
+; SI-NEXT:    v_writelane_b32 v4, s6, 0
+; SI-NEXT:    v_writelane_b32 v4, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB87_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_max_i32_e32 v0, s34, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_max_i32_e32 v1, s34, v2
-; SI-NEXT:    v_mov_b32_e32 v0, v1
-; SI-NEXT:    v_mov_b32_e32 v1, v2
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB87_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v3, 1
-; SI-NEXT:    v_readlane_b32 s6, v3, 0
+; SI-NEXT:    v_readlane_b32 s7, v4, 1
+; SI-NEXT:    v_readlane_b32 s6, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -4346,37 +4346,37 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace(
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v3, s6, 0
-; SI-NEXT:    v_writelane_b32 v3, s7, 1
+; SI-NEXT:    v_writelane_b32 v4, s6, 0
+; SI-NEXT:    v_writelane_b32 v4, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB88_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_max_i32_e32 v0, s34, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_max_i32_e32 v1, s34, v2
-; SI-NEXT:    v_mov_b32_e32 v0, v1
-; SI-NEXT:    v_mov_b32_e32 v1, v2
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB88_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v3, 1
-; SI-NEXT:    v_readlane_b32 s6, v3, 0
+; SI-NEXT:    v_readlane_b32 s7, v4, 1
+; SI-NEXT:    v_readlane_b32 s6, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -4627,20 +4627,20 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i
 ; SI-NEXT:    s_mov_b64 s[0:1], 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s3
+; SI-NEXT:    v_mov_b32_e32 v1, s3
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:  .LBB91_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_max_i32_e32 v0, s2, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_max_i32_e32 v1, s2, v2
-; SI-NEXT:    v_mov_b32_e32 v0, v1
-; SI-NEXT:    v_mov_b32_e32 v1, v2
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT:    s_cbranch_execnz .LBB91_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4726,20 +4726,20 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-NEXT:    s_mov_b64 s[0:1], 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s6
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:  .LBB92_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_max_i32_e32 v0, s8, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_max_i32_e32 v1, s8, v2
-; SI-NEXT:    v_mov_b32_e32 v0, v1
-; SI-NEXT:    v_mov_b32_e32 v1, v2
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT:    s_cbranch_execnz .LBB92_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4748,7 +4748,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_mov_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s5, s3
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_max_i32_ret_addr64_offset:
@@ -4840,20 +4840,20 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ; SI-NEXT:    s_mov_b64 s[0:1], 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s3
+; SI-NEXT:    v_mov_b32_e32 v1, s3
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:  .LBB93_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_max_i32_e32 v0, s2, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_max_i32_e32 v1, s2, v2
-; SI-NEXT:    v_mov_b32_e32 v0, v1
-; SI-NEXT:    v_mov_b32_e32 v1, v2
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT:    s_cbranch_execnz .LBB93_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4936,20 +4936,20 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-NEXT:    s_mov_b64 s[0:1], 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s6
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:  .LBB94_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_max_i32_e32 v0, s8, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_max_i32_e32 v1, s8, v2
-; SI-NEXT:    v_mov_b32_e32 v0, v1
-; SI-NEXT:    v_mov_b32_e32 v1, v2
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT:    s_cbranch_execnz .LBB94_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4958,7 +4958,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_mov_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s5, s3
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_max_i32_ret_addr64:
@@ -5041,21 +5041,21 @@ define void @global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory(ptr add
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB95_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    v_max_i32_e32 v3, v4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_max_i32_e32 v4, v5, v2
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    v_mov_b32_e32 v4, v5
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v6, v4
+; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v4, v5
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB95_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5203,21 +5203,21 @@ define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB97_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    v_max_u32_e32 v3, v4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_max_u32_e32 v4, v5, v2
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    v_mov_b32_e32 v4, v5
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    v_mov_b32_e32 v6, v4
+; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v4, v5
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB97_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5278,21 +5278,21 @@ define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB98_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    v_max_u32_e32 v3, v4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_max_u32_e32 v4, v5, v2
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    v_mov_b32_e32 v4, v5
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v6, v4
+; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v4, v5
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB98_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5511,37 +5511,37 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v3, s6, 0
-; SI-NEXT:    v_writelane_b32 v3, s7, 1
+; SI-NEXT:    v_writelane_b32 v4, s6, 0
+; SI-NEXT:    v_writelane_b32 v4, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB101_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_max_u32_e32 v0, s34, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_max_u32_e32 v1, s34, v2
-; SI-NEXT:    v_mov_b32_e32 v0, v1
-; SI-NEXT:    v_mov_b32_e32 v1, v2
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB101_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v3, 1
-; SI-NEXT:    v_readlane_b32 s6, v3, 0
+; SI-NEXT:    v_readlane_b32 s7, v4, 1
+; SI-NEXT:    v_readlane_b32 s6, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -5599,37 +5599,37 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v3, s6, 0
-; SI-NEXT:    v_writelane_b32 v3, s7, 1
+; SI-NEXT:    v_writelane_b32 v4, s6, 0
+; SI-NEXT:    v_writelane_b32 v4, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB102_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_max_u32_e32 v0, s34, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_max_u32_e32 v1, s34, v2
-; SI-NEXT:    v_mov_b32_e32 v0, v1
-; SI-NEXT:    v_mov_b32_e32 v1, v2
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB102_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v3, 1
-; SI-NEXT:    v_readlane_b32 s6, v3, 0
+; SI-NEXT:    v_readlane_b32 s7, v4, 1
+; SI-NEXT:    v_readlane_b32 s6, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -5880,20 +5880,20 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out,
 ; SI-NEXT:    s_mov_b64 s[0:1], 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s3
+; SI-NEXT:    v_mov_b32_e32 v1, s3
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:  .LBB105_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_max_u32_e32 v0, s2, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_max_u32_e32 v1, s2, v2
-; SI-NEXT:    v_mov_b32_e32 v0, v1
-; SI-NEXT:    v_mov_b32_e32 v1, v2
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT:    s_cbranch_execnz .LBB105_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5979,20 +5979,20 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
 ; SI-NEXT:    s_mov_b64 s[0:1], 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s6
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:  .LBB106_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_max_u32_e32 v0, s8, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_max_u32_e32 v1, s8, v2
-; SI-NEXT:    v_mov_b32_e32 v0, v1
-; SI-NEXT:    v_mov_b32_e32 v1, v2
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT:    s_cbranch_execnz .LBB106_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6001,7 +6001,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_mov_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s5, s3
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_umax_i32_ret_addr64_offset:
@@ -6094,20 +6094,20 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-NEXT:    s_mov_b64 s[0:1], 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s6
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:  .LBB107_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_max_u32_e32 v0, s8, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_max_u32_e32 v1, s8, v2
-; SI-NEXT:    v_mov_b32_e32 v0, v1
-; SI-NEXT:    v_mov_b32_e32 v1, v2
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT:    s_cbranch_execnz .LBB107_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6116,7 +6116,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_mov_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s5, s3
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_umax_i32_ret_addr64:
@@ -6199,21 +6199,21 @@ define void @global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr ad
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB108_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    v_max_u32_e32 v3, v4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_max_u32_e32 v4, v5, v2
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    v_mov_b32_e32 v4, v5
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v6, v4
+; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v4, v5
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB108_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6361,21 +6361,21 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB110_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    v_min_u32_e32 v3, v4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_min_u32_e32 v4, v5, v2
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    v_mov_b32_e32 v4, v5
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    v_mov_b32_e32 v6, v4
+; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v4, v5
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB110_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6436,21 +6436,21 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB111_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    v_min_u32_e32 v3, v4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_min_u32_e32 v4, v5, v2
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    v_mov_b32_e32 v4, v5
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v6, v4
+; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v4, v5
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB111_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6669,37 +6669,37 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v3, s6, 0
-; SI-NEXT:    v_writelane_b32 v3, s7, 1
+; SI-NEXT:    v_writelane_b32 v4, s6, 0
+; SI-NEXT:    v_writelane_b32 v4, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB114_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_min_u32_e32 v0, s34, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_min_u32_e32 v1, s34, v2
-; SI-NEXT:    v_mov_b32_e32 v0, v1
-; SI-NEXT:    v_mov_b32_e32 v1, v2
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB114_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v3, 1
-; SI-NEXT:    v_readlane_b32 s6, v3, 0
+; SI-NEXT:    v_readlane_b32 s7, v4, 1
+; SI-NEXT:    v_readlane_b32 s6, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -6757,37 +6757,37 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v3, s6, 0
-; SI-NEXT:    v_writelane_b32 v3, s7, 1
+; SI-NEXT:    v_writelane_b32 v4, s6, 0
+; SI-NEXT:    v_writelane_b32 v4, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB115_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_min_u32_e32 v0, s34, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_min_u32_e32 v1, s34, v2
-; SI-NEXT:    v_mov_b32_e32 v0, v1
-; SI-NEXT:    v_mov_b32_e32 v1, v2
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB115_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v3, 1
-; SI-NEXT:    v_readlane_b32 s6, v3, 0
+; SI-NEXT:    v_readlane_b32 s7, v4, 1
+; SI-NEXT:    v_readlane_b32 s6, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -7032,21 +7032,21 @@ define void @global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr ad
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB118_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    v_min_u32_e32 v3, v4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_min_u32_e32 v4, v5, v2
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    v_mov_b32_e32 v4, v5
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v6, v4
+; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v4, v5
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB118_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7194,21 +7194,21 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB120_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    v_min_i32_e32 v3, v4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_min_i32_e32 v4, v5, v2
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    v_mov_b32_e32 v4, v5
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    v_mov_b32_e32 v6, v4
+; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v4, v5
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB120_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7269,21 +7269,21 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB121_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    v_min_i32_e32 v3, v4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_min_i32_e32 v4, v5, v2
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    v_mov_b32_e32 v4, v5
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v6, v4
+; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v4, v5
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB121_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7502,37 +7502,37 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v3, s6, 0
-; SI-NEXT:    v_writelane_b32 v3, s7, 1
+; SI-NEXT:    v_writelane_b32 v4, s6, 0
+; SI-NEXT:    v_writelane_b32 v4, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB124_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_min_i32_e32 v0, s34, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_min_i32_e32 v1, s34, v2
-; SI-NEXT:    v_mov_b32_e32 v0, v1
-; SI-NEXT:    v_mov_b32_e32 v1, v2
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB124_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v3, 1
-; SI-NEXT:    v_readlane_b32 s6, v3, 0
+; SI-NEXT:    v_readlane_b32 s7, v4, 1
+; SI-NEXT:    v_readlane_b32 s6, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -7590,37 +7590,37 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace(
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v3, s6, 0
-; SI-NEXT:    v_writelane_b32 v3, s7, 1
+; SI-NEXT:    v_writelane_b32 v4, s6, 0
+; SI-NEXT:    v_writelane_b32 v4, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB125_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_min_i32_e32 v0, s34, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_min_i32_e32 v1, s34, v2
-; SI-NEXT:    v_mov_b32_e32 v0, v1
-; SI-NEXT:    v_mov_b32_e32 v1, v2
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB125_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v3, 1
-; SI-NEXT:    v_readlane_b32 s6, v3, 0
+; SI-NEXT:    v_readlane_b32 s7, v4, 1
+; SI-NEXT:    v_readlane_b32 s6, v4, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -7871,20 +7871,20 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i
 ; SI-NEXT:    s_mov_b64 s[0:1], 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s3
+; SI-NEXT:    v_mov_b32_e32 v1, s3
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:  .LBB128_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_min_i32_e32 v0, s2, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_min_i32_e32 v1, s2, v2
-; SI-NEXT:    v_mov_b32_e32 v0, v1
-; SI-NEXT:    v_mov_b32_e32 v1, v2
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT:    s_cbranch_execnz .LBB128_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7970,20 +7970,20 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-NEXT:    s_mov_b64 s[0:1], 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s6
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:  .LBB129_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_min_i32_e32 v0, s8, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_min_i32_e32 v1, s8, v2
-; SI-NEXT:    v_mov_b32_e32 v0, v1
-; SI-NEXT:    v_mov_b32_e32 v1, v2
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT:    s_cbranch_execnz .LBB129_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7992,7 +7992,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_mov_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s5, s3
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_min_i32_ret_addr64_offset:
@@ -8080,20 +8080,20 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT:    s_mov_b64 s[4:5], 0
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:  .LBB130_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_min_i32_e32 v0, s6, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_min_i32_e32 v1, s6, v2
-; SI-NEXT:    v_mov_b32_e32 v0, v1
-; SI-NEXT:    v_mov_b32_e32 v1, v2
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execnz .LBB130_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8167,20 +8167,20 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-NEXT:    s_mov_b64 s[0:1], 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s6
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:  .LBB131_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_min_i32_e32 v0, s8, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_min_i32_e32 v1, s8, v2
-; SI-NEXT:    v_mov_b32_e32 v0, v1
-; SI-NEXT:    v_mov_b32_e32 v1, v2
-; SI-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v1, v2
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT:    s_cbranch_execnz .LBB131_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8189,7 +8189,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_mov_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s5, s3
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_min_i32_ret_addr64:
@@ -8272,21 +8272,21 @@ define void @global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory(ptr add
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB132_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    v_min_i32_e32 v3, v4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_min_i32_e32 v4, v5, v2
-; SI-NEXT:    v_mov_b32_e32 v3, v4
-; SI-NEXT:    v_mov_b32_e32 v4, v5
-; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT:    v_mov_b32_e32 v6, v4
+; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v4, v5
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB132_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
index 9845064604bc1..59a99a6a0328d 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
@@ -2205,27 +2205,27 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v4, v7, v3
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v8, v6, v2
+; SI-NEXT:    v_not_b32_e32 v5, v4
+; SI-NEXT:    v_not_b32_e32 v4, v8
+; SI-NEXT:    v_mov_b32_e32 v11, v7
+; SI-NEXT:    v_mov_b32_e32 v10, v6
 ; SI-NEXT:    v_mov_b32_e32 v9, v5
 ; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, v9, v3
-; SI-NEXT:    v_and_b32_e32 v5, v8, v2
-; SI-NEXT:    v_not_b32_e32 v7, v4
-; SI-NEXT:    v_not_b32_e32 v6, v5
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    v_mov_b32_e32 v7, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v6, v8
+; SI-NEXT:    v_mov_b32_e32 v7, v9
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB50_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2294,27 +2294,27 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v4, v7, v3
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v8, v6, v2
+; SI-NEXT:    v_not_b32_e32 v5, v4
+; SI-NEXT:    v_not_b32_e32 v4, v8
+; SI-NEXT:    v_mov_b32_e32 v11, v7
+; SI-NEXT:    v_mov_b32_e32 v10, v6
 ; SI-NEXT:    v_mov_b32_e32 v9, v5
 ; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, v9, v3
-; SI-NEXT:    v_and_b32_e32 v5, v8, v2
-; SI-NEXT:    v_not_b32_e32 v7, v4
-; SI-NEXT:    v_not_b32_e32 v6, v5
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    v_mov_b32_e32 v7, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v6, v8
+; SI-NEXT:    v_mov_b32_e32 v7, v9
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB51_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2578,44 +2578,44 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v6, s6, 0
-; SI-NEXT:    v_writelane_b32 v6, s7, 1
+; SI-NEXT:    v_writelane_b32 v8, s6, 0
+; SI-NEXT:    v_writelane_b32 v8, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s7
 ; SI-NEXT:    s_mov_b32 s35, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v0, s34, v3
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v4, s35, v2
+; SI-NEXT:    v_not_b32_e32 v1, v0
+; SI-NEXT:    v_not_b32_e32 v0, v4
+; SI-NEXT:    v_mov_b32_e32 v7, v3
+; SI-NEXT:    v_mov_b32_e32 v6, v2
 ; SI-NEXT:    v_mov_b32_e32 v5, v1
 ; SI-NEXT:    v_mov_b32_e32 v4, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v0, s34, v5
-; SI-NEXT:    v_and_b32_e32 v1, s35, v4
-; SI-NEXT:    v_not_b32_e32 v3, v0
-; SI-NEXT:    v_not_b32_e32 v2, v1
-; SI-NEXT:    v_mov_b32_e32 v0, v2
-; SI-NEXT:    v_mov_b32_e32 v1, v3
-; SI-NEXT:    v_mov_b32_e32 v2, v4
-; SI-NEXT:    v_mov_b32_e32 v3, v5
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v2, v4
+; SI-NEXT:    v_mov_b32_e32 v3, v5
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB54_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v6, 1
-; SI-NEXT:    v_readlane_b32 s6, v6, 0
+; SI-NEXT:    v_readlane_b32 s7, v8, 1
+; SI-NEXT:    v_readlane_b32 s6, v8, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -2683,44 +2683,44 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v6, s6, 0
-; SI-NEXT:    v_writelane_b32 v6, s7, 1
+; SI-NEXT:    v_writelane_b32 v8, s6, 0
+; SI-NEXT:    v_writelane_b32 v8, s7, 1
 ; SI-NEXT:    s_mov_b32 s34, s7
 ; SI-NEXT:    s_mov_b32 s35, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v0, s34, v3
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v4, s35, v2
+; SI-NEXT:    v_not_b32_e32 v1, v0
+; SI-NEXT:    v_not_b32_e32 v0, v4
+; SI-NEXT:    v_mov_b32_e32 v7, v3
+; SI-NEXT:    v_mov_b32_e32 v6, v2
 ; SI-NEXT:    v_mov_b32_e32 v5, v1
 ; SI-NEXT:    v_mov_b32_e32 v4, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v0, s34, v5
-; SI-NEXT:    v_and_b32_e32 v1, s35, v4
-; SI-NEXT:    v_not_b32_e32 v3, v0
-; SI-NEXT:    v_not_b32_e32 v2, v1
-; SI-NEXT:    v_mov_b32_e32 v0, v2
-; SI-NEXT:    v_mov_b32_e32 v1, v3
-; SI-NEXT:    v_mov_b32_e32 v2, v4
-; SI-NEXT:    v_mov_b32_e32 v3, v5
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v2, v4
+; SI-NEXT:    v_mov_b32_e32 v3, v5
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB55_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[36:37]
-; SI-NEXT:    v_readlane_b32 s7, v6, 1
-; SI-NEXT:    v_readlane_b32 s6, v6, 0
+; SI-NEXT:    v_readlane_b32 s7, v8, 1
+; SI-NEXT:    v_readlane_b32 s6, v8, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -3003,27 +3003,27 @@ define void @global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr ad
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v4, v7, v3
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v8, v6, v2
+; SI-NEXT:    v_not_b32_e32 v5, v4
+; SI-NEXT:    v_not_b32_e32 v4, v8
+; SI-NEXT:    v_mov_b32_e32 v11, v7
+; SI-NEXT:    v_mov_b32_e32 v10, v6
 ; SI-NEXT:    v_mov_b32_e32 v9, v5
 ; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, v9, v3
-; SI-NEXT:    v_and_b32_e32 v5, v8, v2
-; SI-NEXT:    v_not_b32_e32 v7, v4
-; SI-NEXT:    v_not_b32_e32 v6, v5
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    v_mov_b32_e32 v7, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v6, v8
+; SI-NEXT:    v_mov_b32_e32 v7, v9
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB58_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4073,26 +4073,26 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB80_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v11, v7
+; SI-NEXT:    v_mov_b32_e32 v10, v6
 ; SI-NEXT:    v_mov_b32_e32 v9, v5
 ; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e32 v7, v3, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    v_mov_b32_e32 v7, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v6, v8
+; SI-NEXT:    v_mov_b32_e32 v7, v9
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB80_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4159,26 +4159,26 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB81_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v11, v7
+; SI-NEXT:    v_mov_b32_e32 v10, v6
 ; SI-NEXT:    v_mov_b32_e32 v9, v5
 ; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e32 v7, v3, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    v_mov_b32_e32 v7, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v6, v8
+; SI-NEXT:    v_mov_b32_e32 v7, v9
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB81_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4443,28 +4443,28 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s35
 ; SI-NEXT:    v_mov_b32_e32 v5, s34
 ; SI-NEXT:  .LBB84_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v9, v1
-; SI-NEXT:    v_mov_b32_e32 v8, v0
-; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9]
-; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
+; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, v6
-; SI-NEXT:    v_mov_b32_e32 v1, v7
-; SI-NEXT:    v_mov_b32_e32 v2, v8
-; SI-NEXT:    v_mov_b32_e32 v3, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v9, v3
+; SI-NEXT:    v_mov_b32_e32 v8, v2
+; SI-NEXT:    v_mov_b32_e32 v7, v1
+; SI-NEXT:    v_mov_b32_e32 v6, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v2, v6
+; SI-NEXT:    v_mov_b32_e32 v3, v7
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB84_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4551,28 +4551,28 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace(
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s35
 ; SI-NEXT:    v_mov_b32_e32 v5, s34
 ; SI-NEXT:  .LBB85_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v9, v1
-; SI-NEXT:    v_mov_b32_e32 v8, v0
-; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9]
-; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
+; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, v6
-; SI-NEXT:    v_mov_b32_e32 v1, v7
-; SI-NEXT:    v_mov_b32_e32 v2, v8
-; SI-NEXT:    v_mov_b32_e32 v3, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
+; SI-NEXT:    v_mov_b32_e32 v9, v3
+; SI-NEXT:    v_mov_b32_e32 v8, v2
+; SI-NEXT:    v_mov_b32_e32 v7, v1
+; SI-NEXT:    v_mov_b32_e32 v6, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v2, v6
+; SI-NEXT:    v_mov_b32_e32 v3, v7
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB85_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4878,26 +4878,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
 ; SI-NEXT:    v_mov_b32_e32 v4, s3
 ; SI-NEXT:    v_mov_b32_e32 v5, s2
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s8
-; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v3, s9
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:  .LBB88_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_mov_b32_e32 v9, v1
-; SI-NEXT:    v_mov_b32_e32 v8, v0
-; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[8:9]
-; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
+; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, v6
-; SI-NEXT:    v_mov_b32_e32 v1, v7
-; SI-NEXT:    v_mov_b32_e32 v2, v8
-; SI-NEXT:    v_mov_b32_e32 v3, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
+; SI-NEXT:    v_mov_b32_e32 v9, v3
+; SI-NEXT:    v_mov_b32_e32 v8, v2
+; SI-NEXT:    v_mov_b32_e32 v7, v1
+; SI-NEXT:    v_mov_b32_e32 v6, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v2, v6
+; SI-NEXT:    v_mov_b32_e32 v3, v7
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT:    s_cbranch_execnz .LBB88_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4989,29 +4989,29 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
 ; SI-NEXT:    s_mov_b64 s[0:1], 0
 ; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    v_mov_b32_e32 v4, s5
-; SI-NEXT:    v_mov_b32_e32 v5, s4
+; SI-NEXT:    v_mov_b32_e32 v8, s5
+; SI-NEXT:    v_mov_b32_e32 v9, s4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    v_mov_b32_e32 v3, s7
 ; SI-NEXT:    s_mov_b32 s10, -1
 ; SI-NEXT:  .LBB89_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_mov_b32_e32 v9, v1
-; SI-NEXT:    v_mov_b32_e32 v8, v0
-; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
-; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
+; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v8, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v9, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, v6
-; SI-NEXT:    v_mov_b32_e32 v1, v7
-; SI-NEXT:    v_mov_b32_e32 v2, v8
-; SI-NEXT:    v_mov_b32_e32 v3, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 glc
+; SI-NEXT:    v_mov_b32_e32 v7, v3
+; SI-NEXT:    v_mov_b32_e32 v6, v2
+; SI-NEXT:    v_mov_b32_e32 v5, v1
+; SI-NEXT:    v_mov_b32_e32 v4, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
 ; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v2, v4
+; SI-NEXT:    v_mov_b32_e32 v3, v5
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT:    s_cbranch_execnz .LBB89_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5020,7 +5020,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_mov_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s5, s3
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_max_i64_ret_addr64_offset:
@@ -5119,26 +5119,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; SI-NEXT:    v_mov_b32_e32 v4, s3
 ; SI-NEXT:    v_mov_b32_e32 v5, s2
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s8
-; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v3, s9
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:  .LBB90_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_mov_b32_e32 v9, v1
-; SI-NEXT:    v_mov_b32_e32 v8, v0
-; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[8:9]
-; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
+; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, v6
-; SI-NEXT:    v_mov_b32_e32 v1, v7
-; SI-NEXT:    v_mov_b32_e32 v2, v8
-; SI-NEXT:    v_mov_b32_e32 v3, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v9, v3
+; SI-NEXT:    v_mov_b32_e32 v8, v2
+; SI-NEXT:    v_mov_b32_e32 v7, v1
+; SI-NEXT:    v_mov_b32_e32 v6, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v2, v6
+; SI-NEXT:    v_mov_b32_e32 v3, v7
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT:    s_cbranch_execnz .LBB90_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5227,29 +5227,29 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
 ; SI-NEXT:    s_mov_b64 s[0:1], 0
 ; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    v_mov_b32_e32 v4, s5
-; SI-NEXT:    v_mov_b32_e32 v5, s4
+; SI-NEXT:    v_mov_b32_e32 v8, s5
+; SI-NEXT:    v_mov_b32_e32 v9, s4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    v_mov_b32_e32 v3, s7
 ; SI-NEXT:    s_mov_b32 s10, -1
 ; SI-NEXT:  .LBB91_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_mov_b32_e32 v9, v1
-; SI-NEXT:    v_mov_b32_e32 v8, v0
-; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
-; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
+; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v8, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v9, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, v6
-; SI-NEXT:    v_mov_b32_e32 v1, v7
-; SI-NEXT:    v_mov_b32_e32 v2, v8
-; SI-NEXT:    v_mov_b32_e32 v3, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v7, v3
+; SI-NEXT:    v_mov_b32_e32 v6, v2
+; SI-NEXT:    v_mov_b32_e32 v5, v1
+; SI-NEXT:    v_mov_b32_e32 v4, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
 ; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v2, v4
+; SI-NEXT:    v_mov_b32_e32 v3, v5
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT:    s_cbranch_execnz .LBB91_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5258,7 +5258,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_mov_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s5, s3
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_max_i64_ret_addr64:
@@ -5347,26 +5347,26 @@ define void @global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr add
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB92_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v11, v7
+; SI-NEXT:    v_mov_b32_e32 v10, v6
 ; SI-NEXT:    v_mov_b32_e32 v9, v5
 ; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e32 v7, v3, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    v_mov_b32_e32 v7, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v6, v8
+; SI-NEXT:    v_mov_b32_e32 v7, v9
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB92_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5535,26 +5535,26 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB94_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v11, v7
+; SI-NEXT:    v_mov_b32_e32 v10, v6
 ; SI-NEXT:    v_mov_b32_e32 v9, v5
 ; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e32 v7, v3, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    v_mov_b32_e32 v7, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v6, v8
+; SI-NEXT:    v_mov_b32_e32 v7, v9
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB94_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5621,26 +5621,26 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB95_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v11, v7
+; SI-NEXT:    v_mov_b32_e32 v10, v6
 ; SI-NEXT:    v_mov_b32_e32 v9, v5
 ; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e32 v7, v3, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    v_mov_b32_e32 v7, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v6, v8
+; SI-NEXT:    v_mov_b32_e32 v7, v9
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB95_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5905,28 +5905,28 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s35
 ; SI-NEXT:    v_mov_b32_e32 v5, s34
 ; SI-NEXT:  .LBB98_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v9, v1
-; SI-NEXT:    v_mov_b32_e32 v8, v0
-; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9]
-; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
+; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, v6
-; SI-NEXT:    v_mov_b32_e32 v1, v7
-; SI-NEXT:    v_mov_b32_e32 v2, v8
-; SI-NEXT:    v_mov_b32_e32 v3, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v9, v3
+; SI-NEXT:    v_mov_b32_e32 v8, v2
+; SI-NEXT:    v_mov_b32_e32 v7, v1
+; SI-NEXT:    v_mov_b32_e32 v6, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v2, v6
+; SI-NEXT:    v_mov_b32_e32 v3, v7
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB98_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6013,28 +6013,28 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s35
 ; SI-NEXT:    v_mov_b32_e32 v5, s34
 ; SI-NEXT:  .LBB99_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v9, v1
-; SI-NEXT:    v_mov_b32_e32 v8, v0
-; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9]
-; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
+; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, v6
-; SI-NEXT:    v_mov_b32_e32 v1, v7
-; SI-NEXT:    v_mov_b32_e32 v2, v8
-; SI-NEXT:    v_mov_b32_e32 v3, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
+; SI-NEXT:    v_mov_b32_e32 v9, v3
+; SI-NEXT:    v_mov_b32_e32 v8, v2
+; SI-NEXT:    v_mov_b32_e32 v7, v1
+; SI-NEXT:    v_mov_b32_e32 v6, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v2, v6
+; SI-NEXT:    v_mov_b32_e32 v3, v7
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB99_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6340,26 +6340,26 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
 ; SI-NEXT:    v_mov_b32_e32 v4, s3
 ; SI-NEXT:    v_mov_b32_e32 v5, s2
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s8
-; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v3, s9
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:  .LBB102_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_mov_b32_e32 v9, v1
-; SI-NEXT:    v_mov_b32_e32 v8, v0
-; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[8:9]
-; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
+; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, v6
-; SI-NEXT:    v_mov_b32_e32 v1, v7
-; SI-NEXT:    v_mov_b32_e32 v2, v8
-; SI-NEXT:    v_mov_b32_e32 v3, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
+; SI-NEXT:    v_mov_b32_e32 v9, v3
+; SI-NEXT:    v_mov_b32_e32 v8, v2
+; SI-NEXT:    v_mov_b32_e32 v7, v1
+; SI-NEXT:    v_mov_b32_e32 v6, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v2, v6
+; SI-NEXT:    v_mov_b32_e32 v3, v7
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT:    s_cbranch_execnz .LBB102_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6451,29 +6451,29 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
 ; SI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
 ; SI-NEXT:    s_mov_b64 s[0:1], 0
 ; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    v_mov_b32_e32 v4, s5
-; SI-NEXT:    v_mov_b32_e32 v5, s4
+; SI-NEXT:    v_mov_b32_e32 v8, s5
+; SI-NEXT:    v_mov_b32_e32 v9, s4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    v_mov_b32_e32 v3, s7
 ; SI-NEXT:    s_mov_b32 s10, -1
 ; SI-NEXT:  .LBB103_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_mov_b32_e32 v9, v1
-; SI-NEXT:    v_mov_b32_e32 v8, v0
-; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
-; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
+; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v8, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v9, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, v6
-; SI-NEXT:    v_mov_b32_e32 v1, v7
-; SI-NEXT:    v_mov_b32_e32 v2, v8
-; SI-NEXT:    v_mov_b32_e32 v3, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 glc
+; SI-NEXT:    v_mov_b32_e32 v7, v3
+; SI-NEXT:    v_mov_b32_e32 v6, v2
+; SI-NEXT:    v_mov_b32_e32 v5, v1
+; SI-NEXT:    v_mov_b32_e32 v4, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
 ; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v2, v4
+; SI-NEXT:    v_mov_b32_e32 v3, v5
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT:    s_cbranch_execnz .LBB103_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6482,7 +6482,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_mov_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s5, s3
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_umax_i64_ret_addr64_offset:
@@ -6577,29 +6577,29 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
 ; SI-NEXT:    s_mov_b64 s[0:1], 0
 ; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    v_mov_b32_e32 v4, s5
-; SI-NEXT:    v_mov_b32_e32 v5, s4
+; SI-NEXT:    v_mov_b32_e32 v8, s5
+; SI-NEXT:    v_mov_b32_e32 v9, s4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    v_mov_b32_e32 v3, s7
 ; SI-NEXT:    s_mov_b32 s10, -1
 ; SI-NEXT:  .LBB104_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_mov_b32_e32 v9, v1
-; SI-NEXT:    v_mov_b32_e32 v8, v0
-; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
-; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
+; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v8, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v9, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, v6
-; SI-NEXT:    v_mov_b32_e32 v1, v7
-; SI-NEXT:    v_mov_b32_e32 v2, v8
-; SI-NEXT:    v_mov_b32_e32 v3, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v7, v3
+; SI-NEXT:    v_mov_b32_e32 v6, v2
+; SI-NEXT:    v_mov_b32_e32 v5, v1
+; SI-NEXT:    v_mov_b32_e32 v4, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
 ; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v2, v4
+; SI-NEXT:    v_mov_b32_e32 v3, v5
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT:    s_cbranch_execnz .LBB104_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6608,7 +6608,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_mov_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s5, s3
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_umax_i64_ret_addr64:
@@ -6697,26 +6697,26 @@ define void @global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr ad
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB105_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v11, v7
+; SI-NEXT:    v_mov_b32_e32 v10, v6
 ; SI-NEXT:    v_mov_b32_e32 v9, v5
 ; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e32 v7, v3, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    v_mov_b32_e32 v7, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v6, v8
+; SI-NEXT:    v_mov_b32_e32 v7, v9
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB105_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6885,26 +6885,26 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB107_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v11, v7
+; SI-NEXT:    v_mov_b32_e32 v10, v6
 ; SI-NEXT:    v_mov_b32_e32 v9, v5
 ; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e32 v7, v3, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    v_mov_b32_e32 v7, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v6, v8
+; SI-NEXT:    v_mov_b32_e32 v7, v9
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB107_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6971,26 +6971,26 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB108_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v11, v7
+; SI-NEXT:    v_mov_b32_e32 v10, v6
 ; SI-NEXT:    v_mov_b32_e32 v9, v5
 ; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e32 v7, v3, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    v_mov_b32_e32 v7, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v6, v8
+; SI-NEXT:    v_mov_b32_e32 v7, v9
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB108_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7255,28 +7255,28 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s35
 ; SI-NEXT:    v_mov_b32_e32 v5, s34
 ; SI-NEXT:  .LBB111_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v9, v1
-; SI-NEXT:    v_mov_b32_e32 v8, v0
-; SI-NEXT:    v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9]
-; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
+; SI-NEXT:    v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, v6
-; SI-NEXT:    v_mov_b32_e32 v1, v7
-; SI-NEXT:    v_mov_b32_e32 v2, v8
-; SI-NEXT:    v_mov_b32_e32 v3, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v9, v3
+; SI-NEXT:    v_mov_b32_e32 v8, v2
+; SI-NEXT:    v_mov_b32_e32 v7, v1
+; SI-NEXT:    v_mov_b32_e32 v6, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v2, v6
+; SI-NEXT:    v_mov_b32_e32 v3, v7
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB111_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7363,28 +7363,28 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s35
 ; SI-NEXT:    v_mov_b32_e32 v5, s34
 ; SI-NEXT:  .LBB112_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v9, v1
-; SI-NEXT:    v_mov_b32_e32 v8, v0
-; SI-NEXT:    v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9]
-; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
+; SI-NEXT:    v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, v6
-; SI-NEXT:    v_mov_b32_e32 v1, v7
-; SI-NEXT:    v_mov_b32_e32 v2, v8
-; SI-NEXT:    v_mov_b32_e32 v3, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
+; SI-NEXT:    v_mov_b32_e32 v9, v3
+; SI-NEXT:    v_mov_b32_e32 v8, v2
+; SI-NEXT:    v_mov_b32_e32 v7, v1
+; SI-NEXT:    v_mov_b32_e32 v6, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v2, v6
+; SI-NEXT:    v_mov_b32_e32 v3, v7
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB112_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7683,26 +7683,26 @@ define void @global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr ad
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB115_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v11, v7
+; SI-NEXT:    v_mov_b32_e32 v10, v6
 ; SI-NEXT:    v_mov_b32_e32 v9, v5
 ; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e32 v7, v3, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    v_mov_b32_e32 v7, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v6, v8
+; SI-NEXT:    v_mov_b32_e32 v7, v9
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB115_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7871,26 +7871,26 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB117_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v11, v7
+; SI-NEXT:    v_mov_b32_e32 v10, v6
 ; SI-NEXT:    v_mov_b32_e32 v9, v5
 ; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e32 v7, v3, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    v_mov_b32_e32 v7, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v6, v8
+; SI-NEXT:    v_mov_b32_e32 v7, v9
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB117_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7957,26 +7957,26 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB118_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v11, v7
+; SI-NEXT:    v_mov_b32_e32 v10, v6
 ; SI-NEXT:    v_mov_b32_e32 v9, v5
 ; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e32 v7, v3, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    v_mov_b32_e32 v7, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v6, v8
+; SI-NEXT:    v_mov_b32_e32 v7, v9
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB118_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8241,28 +8241,28 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s35
 ; SI-NEXT:    v_mov_b32_e32 v5, s34
 ; SI-NEXT:  .LBB121_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v9, v1
-; SI-NEXT:    v_mov_b32_e32 v8, v0
-; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9]
-; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
+; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, v6
-; SI-NEXT:    v_mov_b32_e32 v1, v7
-; SI-NEXT:    v_mov_b32_e32 v2, v8
-; SI-NEXT:    v_mov_b32_e32 v3, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v9, v3
+; SI-NEXT:    v_mov_b32_e32 v8, v2
+; SI-NEXT:    v_mov_b32_e32 v7, v1
+; SI-NEXT:    v_mov_b32_e32 v6, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v2, v6
+; SI-NEXT:    v_mov_b32_e32 v3, v7
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB121_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8349,28 +8349,28 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace(
 ; SI-NEXT:    s_mov_b32 s34, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
 ; SI-NEXT:    s_mov_b64 s[36:37], 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s35
 ; SI-NEXT:    v_mov_b32_e32 v5, s34
 ; SI-NEXT:  .LBB122_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v9, v1
-; SI-NEXT:    v_mov_b32_e32 v8, v0
-; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9]
-; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
+; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, v6
-; SI-NEXT:    v_mov_b32_e32 v1, v7
-; SI-NEXT:    v_mov_b32_e32 v2, v8
-; SI-NEXT:    v_mov_b32_e32 v3, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
+; SI-NEXT:    v_mov_b32_e32 v9, v3
+; SI-NEXT:    v_mov_b32_e32 v8, v2
+; SI-NEXT:    v_mov_b32_e32 v7, v1
+; SI-NEXT:    v_mov_b32_e32 v6, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT:    v_mov_b32_e32 v2, v6
+; SI-NEXT:    v_mov_b32_e32 v3, v7
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT:    s_cbranch_execnz .LBB122_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8676,26 +8676,26 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i
 ; SI-NEXT:    v_mov_b32_e32 v4, s3
 ; SI-NEXT:    v_mov_b32_e32 v5, s2
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s8
-; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v3, s9
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:  .LBB125_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_mov_b32_e32 v9, v1
-; SI-NEXT:    v_mov_b32_e32 v8, v0
-; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[2:3], v[8:9]
-; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
+; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, v6
-; SI-NEXT:    v_mov_b32_e32 v1, v7
-; SI-NEXT:    v_mov_b32_e32 v2, v8
-; SI-NEXT:    v_mov_b32_e32 v3, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
+; SI-NEXT:    v_mov_b32_e32 v9, v3
+; SI-NEXT:    v_mov_b32_e32 v8, v2
+; SI-NEXT:    v_mov_b32_e32 v7, v1
+; SI-NEXT:    v_mov_b32_e32 v6, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v2, v6
+; SI-NEXT:    v_mov_b32_e32 v3, v7
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT:    s_cbranch_execnz .LBB125_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8787,29 +8787,29 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
 ; SI-NEXT:    s_mov_b64 s[0:1], 0
 ; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    v_mov_b32_e32 v4, s5
-; SI-NEXT:    v_mov_b32_e32 v5, s4
+; SI-NEXT:    v_mov_b32_e32 v8, s5
+; SI-NEXT:    v_mov_b32_e32 v9, s4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    v_mov_b32_e32 v3, s7
 ; SI-NEXT:    s_mov_b32 s10, -1
 ; SI-NEXT:  .LBB126_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_mov_b32_e32 v9, v1
-; SI-NEXT:    v_mov_b32_e32 v8, v0
-; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
-; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
+; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v8, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v9, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, v6
-; SI-NEXT:    v_mov_b32_e32 v1, v7
-; SI-NEXT:    v_mov_b32_e32 v2, v8
-; SI-NEXT:    v_mov_b32_e32 v3, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 glc
+; SI-NEXT:    v_mov_b32_e32 v7, v3
+; SI-NEXT:    v_mov_b32_e32 v6, v2
+; SI-NEXT:    v_mov_b32_e32 v5, v1
+; SI-NEXT:    v_mov_b32_e32 v4, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
 ; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v2, v4
+; SI-NEXT:    v_mov_b32_e32 v3, v5
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT:    s_cbranch_execnz .LBB126_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8818,7 +8818,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_mov_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s5, s3
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_min_i64_ret_addr64_offset:
@@ -8913,28 +8913,28 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
 ; SI-NEXT:    v_mov_b32_e32 v4, s3
 ; SI-NEXT:    v_mov_b32_e32 v5, s2
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    v_mov_b32_e32 v3, s5
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:  .LBB127_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_mov_b32_e32 v9, v1
-; SI-NEXT:    v_mov_b32_e32 v8, v0
-; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[2:3], v[8:9]
-; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
+; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, v6
-; SI-NEXT:    v_mov_b32_e32 v1, v7
-; SI-NEXT:    v_mov_b32_e32 v2, v8
-; SI-NEXT:    v_mov_b32_e32 v3, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v9, v3
+; SI-NEXT:    v_mov_b32_e32 v8, v2
+; SI-NEXT:    v_mov_b32_e32 v7, v1
+; SI-NEXT:    v_mov_b32_e32 v6, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v2, v6
+; SI-NEXT:    v_mov_b32_e32 v3, v7
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB127_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -9014,29 +9014,29 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
 ; SI-NEXT:    s_mov_b64 s[0:1], 0
 ; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    v_mov_b32_e32 v4, s5
-; SI-NEXT:    v_mov_b32_e32 v5, s4
+; SI-NEXT:    v_mov_b32_e32 v8, s5
+; SI-NEXT:    v_mov_b32_e32 v9, s4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    v_mov_b32_e32 v3, s7
 ; SI-NEXT:    s_mov_b32 s10, -1
 ; SI-NEXT:  .LBB128_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_mov_b32_e32 v9, v1
-; SI-NEXT:    v_mov_b32_e32 v8, v0
-; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
-; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
+; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v8, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v9, v2, vcc
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, v6
-; SI-NEXT:    v_mov_b32_e32 v1, v7
-; SI-NEXT:    v_mov_b32_e32 v2, v8
-; SI-NEXT:    v_mov_b32_e32 v3, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 glc
+; SI-NEXT:    v_mov_b32_e32 v7, v3
+; SI-NEXT:    v_mov_b32_e32 v6, v2
+; SI-NEXT:    v_mov_b32_e32 v5, v1
+; SI-NEXT:    v_mov_b32_e32 v4, v0
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
 ; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v2, v4
+; SI-NEXT:    v_mov_b32_e32 v3, v5
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT:    s_cbranch_execnz .LBB128_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -9045,7 +9045,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_mov_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s5, s3
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_min_i64_ret_addr64:
@@ -9134,26 +9134,26 @@ define void @global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr add
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
-; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  .LBB129_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v11, v7
+; SI-NEXT:    v_mov_b32_e32 v10, v6
 ; SI-NEXT:    v_mov_b32_e32 v9, v5
 ; SI-NEXT:    v_mov_b32_e32 v8, v4
-; SI-NEXT:    v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e32 v7, v3, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v2, v8, vcc
-; SI-NEXT:    v_mov_b32_e32 v4, v6
-; SI-NEXT:    v_mov_b32_e32 v5, v7
-; SI-NEXT:    v_mov_b32_e32 v6, v8
-; SI-NEXT:    v_mov_b32_e32 v7, v9
-; SI-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[8:9]
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v6, v8
+; SI-NEXT:    v_mov_b32_e32 v7, v9
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB129_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index d17498a933be3..82c58394c03bb 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -34,19 +34,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX7LESS-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
 ; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7LESS-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB0_2
 ; GFX7LESS-NEXT:  .LBB0_3:
@@ -205,19 +205,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX7LESS-DPP-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
 ; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-DPP-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7LESS-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB0_2
 ; GFX7LESS-DPP-NEXT:  .LBB0_3:
@@ -416,20 +416,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:  .LBB1_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7LESS-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB1_4
 ; GFX7LESS-NEXT:  .LBB1_5:
@@ -762,20 +762,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[36:39], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7LESS-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_add_f32_e32 v2, v3, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB1_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1163,19 +1163,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX7LESS-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7LESS-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB2_2
 ; GFX7LESS-NEXT:  .LBB2_3:
@@ -1383,19 +1383,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX7LESS-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-DPP-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7LESS-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB2_2
 ; GFX7LESS-DPP-NEXT:  .LBB2_3:
@@ -1634,20 +1634,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:  .LBB3_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7LESS-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB3_4
 ; GFX7LESS-NEXT:  .LBB3_5:
@@ -1980,20 +1980,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[36:39], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7LESS-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_add_f32_e32 v2, v3, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB3_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2381,19 +2381,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX7LESS-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7LESS-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB4_2
 ; GFX7LESS-NEXT:  .LBB4_3:
@@ -2631,19 +2631,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX7LESS-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-DPP-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7LESS-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB4_2
 ; GFX7LESS-DPP-NEXT:  .LBB4_3:
@@ -2912,20 +2912,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:  .LBB5_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7LESS-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB5_4
 ; GFX7LESS-NEXT:  .LBB5_5:
@@ -3258,20 +3258,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[36:39], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7LESS-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_add_f32_e32 v2, v3, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB5_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3686,20 +3686,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:  .LBB6_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7LESS-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB6_4
 ; GFX7LESS-NEXT:  .LBB6_5:
@@ -4032,20 +4032,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[36:39], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7LESS-DPP-NEXT:  .LBB6_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_add_f32_e32 v2, v3, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB6_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4433,19 +4433,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
 ; GFX7LESS-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7LESS-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB7_2
 ; GFX7LESS-NEXT:  .LBB7_3:
@@ -4683,19 +4683,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
 ; GFX7LESS-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-DPP-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7LESS-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB7_2
 ; GFX7LESS-DPP-NEXT:  .LBB7_3:
@@ -4963,20 +4963,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:  .LBB8_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7LESS-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB8_4
 ; GFX7LESS-NEXT:  .LBB8_5:
@@ -5335,20 +5335,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[36:39], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7LESS-DPP-NEXT:  .LBB8_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_add_f32_e32 v2, v3, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB8_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8004,23 +8004,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
 ; GFX7LESS-NEXT:    v_add_f64 v[0:1], s[6:7], v[0:1]
 ; GFX7LESS-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s9
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s8
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB11_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v0
-; GFX7LESS-NEXT:    v_add_f64 v[6:7], v[8:9], v[4:5]
+; GFX7LESS-NEXT:    v_add_f64 v[0:1], v[2:3], v[4:5]
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v6
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v7
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v9
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB11_2
 ; GFX7LESS-NEXT:  .LBB11_3:
@@ -8261,23 +8261,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
 ; GFX7LESS-DPP-NEXT:    v_add_f64 v[0:1], s[6:7], v[0:1]
 ; GFX7LESS-DPP-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s8
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s9
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s8
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB11_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v9, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v8, v0
-; GFX7LESS-DPP-NEXT:    v_add_f64 v[6:7], v[8:9], v[4:5]
+; GFX7LESS-DPP-NEXT:    v_add_f64 v[0:1], v[2:3], v[4:5]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v6
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v7
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v8
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v9
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB11_2
 ; GFX7LESS-DPP-NEXT:  .LBB11_3:
@@ -8551,23 +8551,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7LESS-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:  .LBB12_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v0
-; GFX7LESS-NEXT:    v_add_f64 v[6:7], v[8:9], v[4:5]
+; GFX7LESS-NEXT:    v_add_f64 v[0:1], v[2:3], v[4:5]
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v6
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v7
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v9
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB12_4
 ; GFX7LESS-NEXT:  .LBB12_5:
@@ -8941,23 +8941,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[4:5], off, s[36:39], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7LESS-DPP-NEXT:  .LBB12_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v9, v5
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_add_f64 v[4:5], v[6:7], v[0:1]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v4
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v6
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, v7
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7]
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v6
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -9437,23 +9437,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX7LESS-NEXT:    v_add_f64 v[0:1], s[6:7], v[0:1]
 ; GFX7LESS-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s9
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s8
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB13_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v0
-; GFX7LESS-NEXT:    v_add_f64 v[6:7], v[8:9], v[4:5]
+; GFX7LESS-NEXT:    v_add_f64 v[0:1], v[2:3], v[4:5]
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v6
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v7
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v9
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB13_2
 ; GFX7LESS-NEXT:  .LBB13_3:
@@ -9694,23 +9694,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX7LESS-DPP-NEXT:    v_add_f64 v[0:1], s[6:7], v[0:1]
 ; GFX7LESS-DPP-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s8
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s9
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s8
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v9, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v8, v0
-; GFX7LESS-DPP-NEXT:    v_add_f64 v[6:7], v[8:9], v[4:5]
+; GFX7LESS-DPP-NEXT:    v_add_f64 v[0:1], v[2:3], v[4:5]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v6
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v7
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v8
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v9
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB13_2
 ; GFX7LESS-DPP-NEXT:  .LBB13_3:
@@ -9984,23 +9984,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7LESS-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:  .LBB14_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v0
-; GFX7LESS-NEXT:    v_add_f64 v[6:7], v[8:9], v[4:5]
+; GFX7LESS-NEXT:    v_add_f64 v[0:1], v[2:3], v[4:5]
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v6
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v7
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v9
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB14_4
 ; GFX7LESS-NEXT:  .LBB14_5:
@@ -10374,23 +10374,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[4:5], off, s[36:39], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7LESS-DPP-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v9, v5
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_add_f64 v[4:5], v[6:7], v[0:1]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v4
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v6
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, v7
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7]
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v6
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -10899,23 +10899,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7LESS-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:  .LBB15_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v0
-; GFX7LESS-NEXT:    v_add_f64 v[6:7], v[8:9], v[4:5]
+; GFX7LESS-NEXT:    v_add_f64 v[0:1], v[2:3], v[4:5]
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v6
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v7
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v9
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB15_4
 ; GFX7LESS-NEXT:  .LBB15_5:
@@ -11289,23 +11289,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[4:5], off, s[36:39], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7LESS-DPP-NEXT:  .LBB15_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v9, v5
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_add_f64 v[4:5], v[6:7], v[0:1]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v4
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v6
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, v7
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7]
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v6
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -14051,19 +14051,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX7LESS-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
 ; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB18_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7LESS-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB18_2
 ; GFX7LESS-NEXT:  .LBB18_3:
@@ -14222,19 +14222,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX7LESS-DPP-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
 ; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB18_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-DPP-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7LESS-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB18_2
 ; GFX7LESS-DPP-NEXT:  .LBB18_3:
@@ -14397,19 +14397,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX7LESS-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
 ; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB19_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7LESS-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB19_2
 ; GFX7LESS-NEXT:  .LBB19_3:
@@ -14568,19 +14568,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX7LESS-DPP-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
 ; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB19_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-DPP-NEXT:    v_add_f32_e32 v3, v4, v2
+; GFX7LESS-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB19_2
 ; GFX7LESS-DPP-NEXT:  .LBB19_3:

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index cc2f490cc1ff0..f8f911b693e09 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -30,20 +30,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_max_f32_e32 v0, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-NEXT:    v_max_f32_e32 v1, 4.0, v0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB0_2
 ; GFX7LESS-NEXT:  .LBB0_3:
@@ -155,20 +155,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-DPP-NEXT:    v_max_f32_e32 v1, 4.0, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB0_2
 ; GFX7LESS-DPP-NEXT:  .LBB0_3:
@@ -301,18 +301,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX7LESS-NEXT:  .LBB1_1: ; %ComputeLoop
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX7LESS-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7LESS-NEXT:    v_mul_f32_e32 v1, 1.0, v2
 ; GFX7LESS-NEXT:    v_readlane_b32 s4, v0, s2
 ; GFX7LESS-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX7LESS-NEXT:    v_mul_f32_e64 v2, 1.0, s4
 ; GFX7LESS-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX7LESS-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GFX7LESS-NEXT:    s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX7LESS-NEXT:    v_max_f32_e32 v2, v1, v2
 ; GFX7LESS-NEXT:    s_cbranch_vccnz .LBB1_1
 ; GFX7LESS-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
@@ -326,22 +326,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 1.0, v1
+; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7LESS-NEXT:  .LBB1_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v4
-; GFX7LESS-NEXT:    v_max_f32_e32 v3, v0, v2
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB1_4
 ; GFX7LESS-NEXT:  .LBB1_5:
@@ -666,20 +666,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[36:39], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v2, 1.0, v0
 ; GFX7LESS-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v1, 1.0, v3
-; GFX7LESS-DPP-NEXT:    v_max_f32_e32 v2, v1, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB1_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1068,20 +1068,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_max_f32_e32 v0, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-NEXT:    v_max_f32_e32 v1, 4.0, v0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB2_2
 ; GFX7LESS-NEXT:  .LBB2_3:
@@ -1193,20 +1193,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-DPP-NEXT:    v_max_f32_e32 v1, 4.0, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB2_2
 ; GFX7LESS-DPP-NEXT:  .LBB2_3:
@@ -1340,18 +1340,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX7LESS-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX7LESS-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7LESS-NEXT:    v_mul_f32_e32 v1, 1.0, v2
 ; GFX7LESS-NEXT:    v_readlane_b32 s4, v0, s2
 ; GFX7LESS-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX7LESS-NEXT:    v_mul_f32_e64 v2, 1.0, s4
 ; GFX7LESS-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX7LESS-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GFX7LESS-NEXT:    s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX7LESS-NEXT:    v_max_f32_e32 v2, v1, v2
 ; GFX7LESS-NEXT:    s_cbranch_vccnz .LBB3_1
 ; GFX7LESS-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
@@ -1365,22 +1365,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 1.0, v1
+; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7LESS-NEXT:  .LBB3_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v4
-; GFX7LESS-NEXT:    v_max_f32_e32 v3, v0, v2
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB3_4
 ; GFX7LESS-NEXT:  .LBB3_5:
@@ -1705,20 +1705,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[36:39], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v2, 1.0, v0
 ; GFX7LESS-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v1, 1.0, v3
-; GFX7LESS-DPP-NEXT:    v_max_f32_e32 v2, v1, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB3_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2108,20 +2108,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_max_f32_e32 v0, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-NEXT:    v_max_f32_e32 v1, 4.0, v0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB4_2
 ; GFX7LESS-NEXT:  .LBB4_3:
@@ -2233,20 +2233,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-DPP-NEXT:    v_max_f32_e32 v1, 4.0, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB4_2
 ; GFX7LESS-DPP-NEXT:  .LBB4_3:
@@ -2379,18 +2379,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX7LESS-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX7LESS-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7LESS-NEXT:    v_mul_f32_e32 v1, 1.0, v2
 ; GFX7LESS-NEXT:    v_readlane_b32 s4, v0, s2
 ; GFX7LESS-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX7LESS-NEXT:    v_mul_f32_e64 v2, 1.0, s4
 ; GFX7LESS-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX7LESS-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GFX7LESS-NEXT:    s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX7LESS-NEXT:    v_max_f32_e32 v2, v1, v2
 ; GFX7LESS-NEXT:    s_cbranch_vccnz .LBB5_1
 ; GFX7LESS-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
@@ -2404,22 +2404,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 1.0, v1
+; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7LESS-NEXT:  .LBB5_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v4
-; GFX7LESS-NEXT:    v_max_f32_e32 v3, v0, v2
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB5_4
 ; GFX7LESS-NEXT:  .LBB5_5:
@@ -2744,20 +2744,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[36:39], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v2, 1.0, v0
 ; GFX7LESS-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v1, 1.0, v3
-; GFX7LESS-DPP-NEXT:    v_max_f32_e32 v2, v1, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB5_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4275,20 +4275,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1064-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s4, s[0:1]
-; GFX1064-NEXT:    v_max_f64 v[2:3], v[3:4], v[3:4]
+; GFX1064-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1064-NEXT:    v_readlane_b32 s3, v1, s4
 ; GFX1064-NEXT:    v_readlane_b32 s2, v0, s4
 ; GFX1064-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[2:3], v[4:5]
+; GFX1064-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4300,26 +4300,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1064-NEXT:  ; %bb.3:
 ; GFX1064-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
+; GFX1064-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
 ; GFX1064-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
+; GFX1064-NEXT:    global_load_dwordx2 v[4:5], v0, s[52:53]
 ; GFX1064-NEXT:  .LBB7_4: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1064-NEXT:    v_max_f64 v[0:1], v[4:5], v[4:5]
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[64:67], 0
+; GFX1064-NEXT:    buffer_store_dword v5, off, s[64:67], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v4, off, s[64:67], 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, s52
+; GFX1064-NEXT:    v_mov_b32_e32 v3, s53
+; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
@@ -4331,16 +4331,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[3:4], v[41:42]
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s53
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064-NEXT:    v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[64:67], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-NEXT:    buffer_load_dword v4, off, s[64:67], 0
+; GFX1064-NEXT:    buffer_load_dword v5, off, s[64:67], 0 offset:4
 ; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX1064-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
@@ -5424,24 +5424,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v7, v3
+; GFX7LESS-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[0:1], 4.0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v2
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v4
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v4
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB8_2
 ; GFX7LESS-NEXT:  .LBB8_3:
@@ -5590,24 +5590,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
+; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], 4.0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v2
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v3
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v4
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v4
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB8_2
 ; GFX7LESS-DPP-NEXT:  .LBB8_3:
@@ -5777,12 +5777,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
 ; GFX7LESS-NEXT:  .LBB9_1: ; %ComputeLoop
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_ff1_i32_b64 s4, s[0:1]
-; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX7LESS-NEXT:    v_readlane_b32 s3, v1, s4
 ; GFX7LESS-NEXT:    v_readlane_b32 s2, v0, s4
 ; GFX7LESS-NEXT:    s_lshl_b64 s[4:5], 1, s4
@@ -5790,7 +5790,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX7LESS-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
 ; GFX7LESS-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GFX7LESS-NEXT:    s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX7LESS-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_vccnz .LBB9_1
 ; GFX7LESS-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
@@ -5804,25 +5804,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7LESS-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX7LESS-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX7LESS-NEXT:  .LBB9_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v0
+; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX7LESS-NEXT:    v_max_f64 v[6:7], v[0:1], v[4:5]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v6
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v7
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v9
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB9_4
 ; GFX7LESS-NEXT:  .LBB9_5:
@@ -6191,23 +6191,23 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[36:39], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX7LESS-DPP-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
 ; GFX7LESS-DPP-NEXT:  .LBB9_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[2:3], v[6:7], v[6:7]
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[4:5], v[2:3], v[0:1]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v4
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v6
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, v7
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7]
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB9_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7821,20 +7821,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1064-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s4, s[0:1]
-; GFX1064-NEXT:    v_max_f64 v[2:3], v[3:4], v[3:4]
+; GFX1064-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1064-NEXT:    v_readlane_b32 s3, v1, s4
 ; GFX1064-NEXT:    v_readlane_b32 s2, v0, s4
 ; GFX1064-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[2:3], v[4:5]
+; GFX1064-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7846,26 +7846,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1064-NEXT:  ; %bb.3:
 ; GFX1064-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
+; GFX1064-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
 ; GFX1064-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
+; GFX1064-NEXT:    global_load_dwordx2 v[4:5], v0, s[52:53]
 ; GFX1064-NEXT:  .LBB11_4: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1064-NEXT:    v_max_f64 v[0:1], v[4:5], v[4:5]
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[64:67], 0
+; GFX1064-NEXT:    buffer_store_dword v5, off, s[64:67], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v4, off, s[64:67], 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, s52
+; GFX1064-NEXT:    v_mov_b32_e32 v3, s53
+; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
@@ -7877,16 +7877,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[3:4], v[41:42]
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s53
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064-NEXT:    v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[64:67], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-NEXT:    buffer_load_dword v4, off, s[64:67], 0
+; GFX1064-NEXT:    buffer_load_dword v5, off, s[64:67], 0 offset:4
 ; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX1064-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
@@ -8970,20 +8970,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB12_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_max_f32_e32 v0, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-NEXT:    v_max_f32_e32 v1, 4.0, v0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB12_2
 ; GFX7LESS-NEXT:  .LBB12_3:
@@ -9095,20 +9095,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB12_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-DPP-NEXT:    v_max_f32_e32 v1, 4.0, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB12_2
 ; GFX7LESS-DPP-NEXT:  .LBB12_3:
@@ -9224,20 +9224,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB13_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_max_f32_e32 v0, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-NEXT:    v_max_f32_e32 v1, 4.0, v0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB13_2
 ; GFX7LESS-NEXT:  .LBB13_3:
@@ -9349,20 +9349,20 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-DPP-NEXT:    v_max_f32_e32 v1, 4.0, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB13_2
 ; GFX7LESS-DPP-NEXT:  .LBB13_3:

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index b3d81b5e9aec8..1f76a476107a3 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -30,20 +30,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_min_f32_e32 v0, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-NEXT:    v_min_f32_e32 v1, 4.0, v0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB0_2
 ; GFX7LESS-NEXT:  .LBB0_3:
@@ -155,20 +155,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-DPP-NEXT:    v_min_f32_e32 v1, 4.0, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB0_2
 ; GFX7LESS-DPP-NEXT:  .LBB0_3:
@@ -301,18 +301,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX7LESS-NEXT:  .LBB1_1: ; %ComputeLoop
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX7LESS-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7LESS-NEXT:    v_mul_f32_e32 v1, 1.0, v2
 ; GFX7LESS-NEXT:    v_readlane_b32 s4, v0, s2
 ; GFX7LESS-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX7LESS-NEXT:    v_mul_f32_e64 v2, 1.0, s4
 ; GFX7LESS-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX7LESS-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GFX7LESS-NEXT:    s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX7LESS-NEXT:    v_min_f32_e32 v2, v1, v2
 ; GFX7LESS-NEXT:    s_cbranch_vccnz .LBB1_1
 ; GFX7LESS-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
@@ -326,22 +326,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 1.0, v1
+; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7LESS-NEXT:  .LBB1_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v4
-; GFX7LESS-NEXT:    v_min_f32_e32 v3, v0, v2
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB1_4
 ; GFX7LESS-NEXT:  .LBB1_5:
@@ -666,20 +666,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[36:39], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v2, 1.0, v0
 ; GFX7LESS-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v1, 1.0, v3
-; GFX7LESS-DPP-NEXT:    v_min_f32_e32 v2, v1, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB1_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1068,20 +1068,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_min_f32_e32 v0, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-NEXT:    v_min_f32_e32 v1, 4.0, v0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB2_2
 ; GFX7LESS-NEXT:  .LBB2_3:
@@ -1193,20 +1193,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-DPP-NEXT:    v_min_f32_e32 v1, 4.0, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB2_2
 ; GFX7LESS-DPP-NEXT:  .LBB2_3:
@@ -1340,18 +1340,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX7LESS-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX7LESS-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7LESS-NEXT:    v_mul_f32_e32 v1, 1.0, v2
 ; GFX7LESS-NEXT:    v_readlane_b32 s4, v0, s2
 ; GFX7LESS-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX7LESS-NEXT:    v_mul_f32_e64 v2, 1.0, s4
 ; GFX7LESS-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX7LESS-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GFX7LESS-NEXT:    s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX7LESS-NEXT:    v_min_f32_e32 v2, v1, v2
 ; GFX7LESS-NEXT:    s_cbranch_vccnz .LBB3_1
 ; GFX7LESS-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
@@ -1365,22 +1365,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 1.0, v1
+; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7LESS-NEXT:  .LBB3_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v4
-; GFX7LESS-NEXT:    v_min_f32_e32 v3, v0, v2
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB3_4
 ; GFX7LESS-NEXT:  .LBB3_5:
@@ -1705,20 +1705,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[36:39], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v2, 1.0, v0
 ; GFX7LESS-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v1, 1.0, v3
-; GFX7LESS-DPP-NEXT:    v_min_f32_e32 v2, v1, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB3_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2108,20 +2108,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_min_f32_e32 v0, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-NEXT:    v_min_f32_e32 v1, 4.0, v0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB4_2
 ; GFX7LESS-NEXT:  .LBB4_3:
@@ -2233,20 +2233,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-DPP-NEXT:    v_min_f32_e32 v1, 4.0, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB4_2
 ; GFX7LESS-DPP-NEXT:  .LBB4_3:
@@ -2379,18 +2379,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX7LESS-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX7LESS-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7LESS-NEXT:    v_mul_f32_e32 v1, 1.0, v2
 ; GFX7LESS-NEXT:    v_readlane_b32 s4, v0, s2
 ; GFX7LESS-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX7LESS-NEXT:    v_mul_f32_e64 v2, 1.0, s4
 ; GFX7LESS-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX7LESS-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GFX7LESS-NEXT:    s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX7LESS-NEXT:    v_min_f32_e32 v2, v1, v2
 ; GFX7LESS-NEXT:    s_cbranch_vccnz .LBB5_1
 ; GFX7LESS-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
@@ -2404,22 +2404,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 1.0, v1
+; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7LESS-NEXT:  .LBB5_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v4
-; GFX7LESS-NEXT:    v_min_f32_e32 v3, v0, v2
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB5_4
 ; GFX7LESS-NEXT:  .LBB5_5:
@@ -2744,20 +2744,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[36:39], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v2, 1.0, v0
 ; GFX7LESS-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v1, 1.0, v3
-; GFX7LESS-DPP-NEXT:    v_min_f32_e32 v2, v1, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB5_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4275,20 +4275,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1064-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s4, s[0:1]
-; GFX1064-NEXT:    v_max_f64 v[2:3], v[3:4], v[3:4]
+; GFX1064-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1064-NEXT:    v_readlane_b32 s3, v1, s4
 ; GFX1064-NEXT:    v_readlane_b32 s2, v0, s4
 ; GFX1064-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_min_f64 v[3:4], v[2:3], v[4:5]
+; GFX1064-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4300,26 +4300,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1064-NEXT:  ; %bb.3:
 ; GFX1064-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
+; GFX1064-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
 ; GFX1064-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
+; GFX1064-NEXT:    global_load_dwordx2 v[4:5], v0, s[52:53]
 ; GFX1064-NEXT:  .LBB7_4: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1064-NEXT:    v_max_f64 v[0:1], v[4:5], v[4:5]
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[64:67], 0
+; GFX1064-NEXT:    buffer_store_dword v5, off, s[64:67], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v4, off, s[64:67], 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, s52
+; GFX1064-NEXT:    v_mov_b32_e32 v3, s53
+; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
@@ -4331,16 +4331,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
-; GFX1064-NEXT:    v_min_f64 v[3:4], v[3:4], v[41:42]
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s53
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064-NEXT:    v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[64:67], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-NEXT:    buffer_load_dword v4, off, s[64:67], 0
+; GFX1064-NEXT:    buffer_load_dword v5, off, s[64:67], 0 offset:4
 ; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX1064-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
@@ -5424,24 +5424,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v7, v3
+; GFX7LESS-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX7LESS-NEXT:    v_min_f64 v[2:3], v[0:1], 4.0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v2
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v4
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v4
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB8_2
 ; GFX7LESS-NEXT:  .LBB8_3:
@@ -5590,24 +5590,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-DPP-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
+; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX7LESS-DPP-NEXT:    v_min_f64 v[2:3], v[0:1], 4.0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v2
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v3
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v4
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v4
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB8_2
 ; GFX7LESS-DPP-NEXT:  .LBB8_3:
@@ -5777,12 +5777,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
 ; GFX7LESS-NEXT:  .LBB9_1: ; %ComputeLoop
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_ff1_i32_b64 s4, s[0:1]
-; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX7LESS-NEXT:    v_readlane_b32 s3, v1, s4
 ; GFX7LESS-NEXT:    v_readlane_b32 s2, v0, s4
 ; GFX7LESS-NEXT:    s_lshl_b64 s[4:5], 1, s4
@@ -5790,7 +5790,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX7LESS-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
 ; GFX7LESS-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GFX7LESS-NEXT:    s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX7LESS-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_vccnz .LBB9_1
 ; GFX7LESS-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
@@ -5804,25 +5804,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7LESS-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX7LESS-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX7LESS-NEXT:  .LBB9_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v0
+; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX7LESS-NEXT:    v_min_f64 v[6:7], v[0:1], v[4:5]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v6
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v7
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v9
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB9_4
 ; GFX7LESS-NEXT:  .LBB9_5:
@@ -6191,23 +6191,23 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[36:39], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX7LESS-DPP-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
 ; GFX7LESS-DPP-NEXT:  .LBB9_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-DPP-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[2:3], v[6:7], v[6:7]
-; GFX7LESS-DPP-NEXT:    v_min_f64 v[4:5], v[2:3], v[0:1]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v4
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v6
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, v7
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7]
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB9_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7821,20 +7821,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    s_movk_i32 s32, 0x800
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1064-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s4, s[0:1]
-; GFX1064-NEXT:    v_max_f64 v[2:3], v[3:4], v[3:4]
+; GFX1064-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1064-NEXT:    v_readlane_b32 s3, v1, s4
 ; GFX1064-NEXT:    v_readlane_b32 s2, v0, s4
 ; GFX1064-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_min_f64 v[3:4], v[2:3], v[4:5]
+; GFX1064-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7846,26 +7846,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1064-NEXT:  ; %bb.3:
 ; GFX1064-NEXT:    s_load_dwordx2 s[52:53], s[34:35], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
+; GFX1064-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
 ; GFX1064-NEXT:    s_mov_b64 s[54:55], 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[1:2], v0, s[52:53]
+; GFX1064-NEXT:    global_load_dwordx2 v[4:5], v0, s[52:53]
 ; GFX1064-NEXT:  .LBB11_4: ; %atomicrmw.start
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1064-NEXT:    v_max_f64 v[0:1], v[4:5], v[4:5]
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1064-NEXT:    s_getpc_b64 s[0:1]
 ; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
 ; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[64:67], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[64:67], 0
+; GFX1064-NEXT:    buffer_store_dword v5, off, s[64:67], 0 offset:4
+; GFX1064-NEXT:    buffer_store_dword v4, off, s[64:67], 0
 ; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
 ; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, s52
+; GFX1064-NEXT:    v_mov_b32_e32 v3, s53
+; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
@@ -7877,16 +7877,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    s_mov_b32 s13, s50
 ; GFX1064-NEXT:    s_mov_b32 s14, s33
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[66:67]
-; GFX1064-NEXT:    v_min_f64 v[3:4], v[3:4], v[41:42]
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[64:67], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[64:67], 0 offset:8
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s53
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064-NEXT:    v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1064-NEXT:    buffer_store_dword v1, off, s[64:67], 0 offset:12
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[64:67], 0 offset:8
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[64:67], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[64:67], 0 offset:4
+; GFX1064-NEXT:    buffer_load_dword v4, off, s[64:67], 0
+; GFX1064-NEXT:    buffer_load_dword v5, off, s[64:67], 0 offset:4
 ; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX1064-NEXT:    s_or_b64 s[54:55], vcc, s[54:55]
@@ -8970,20 +8970,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB12_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_min_f32_e32 v0, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-NEXT:    v_min_f32_e32 v1, 4.0, v0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB12_2
 ; GFX7LESS-NEXT:  .LBB12_3:
@@ -9095,20 +9095,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB12_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-DPP-NEXT:    v_min_f32_e32 v1, 4.0, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB12_2
 ; GFX7LESS-DPP-NEXT:  .LBB12_3:
@@ -9224,20 +9224,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB13_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_min_f32_e32 v0, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-NEXT:    v_min_f32_e32 v1, 4.0, v0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB13_2
 ; GFX7LESS-NEXT:  .LBB13_3:
@@ -9349,20 +9349,20 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v2
-; GFX7LESS-DPP-NEXT:    v_min_f32_e32 v1, 4.0, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB13_2
 ; GFX7LESS-DPP-NEXT:  .LBB13_3:

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index b6fa8d7e32199..9db3c37045ccf 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -34,19 +34,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX7LESS-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
 ; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    v_sub_f32_e32 v3, v4, v2
+; GFX7LESS-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB0_2
 ; GFX7LESS-NEXT:  .LBB0_3:
@@ -235,19 +235,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX7LESS-DPP-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
 ; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-DPP-NEXT:    v_sub_f32_e32 v3, v4, v2
+; GFX7LESS-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB0_2
 ; GFX7LESS-DPP-NEXT:  .LBB0_3:
@@ -476,20 +476,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:  .LBB1_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    v_sub_f32_e32 v3, v4, v2
+; GFX7LESS-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB1_4
 ; GFX7LESS-NEXT:  .LBB1_5:
@@ -848,20 +848,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[36:39], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7LESS-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_sub_f32_e32 v2, v3, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB1_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1275,19 +1275,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX7LESS-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    v_sub_f32_e32 v3, v4, v2
+; GFX7LESS-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB2_2
 ; GFX7LESS-NEXT:  .LBB2_3:
@@ -1525,19 +1525,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX7LESS-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-DPP-NEXT:    v_sub_f32_e32 v3, v4, v2
+; GFX7LESS-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB2_2
 ; GFX7LESS-DPP-NEXT:  .LBB2_3:
@@ -1806,20 +1806,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:  .LBB3_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    v_sub_f32_e32 v3, v4, v2
+; GFX7LESS-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB3_4
 ; GFX7LESS-NEXT:  .LBB3_5:
@@ -2178,20 +2178,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[36:39], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7LESS-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_sub_f32_e32 v2, v3, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB3_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2605,19 +2605,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX7LESS-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    v_sub_f32_e32 v3, v4, v2
+; GFX7LESS-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB4_2
 ; GFX7LESS-NEXT:  .LBB4_3:
@@ -2855,19 +2855,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX7LESS-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-DPP-NEXT:    v_sub_f32_e32 v3, v4, v2
+; GFX7LESS-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB4_2
 ; GFX7LESS-DPP-NEXT:  .LBB4_3:
@@ -3136,20 +3136,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:  .LBB5_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    v_sub_f32_e32 v3, v4, v2
+; GFX7LESS-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB5_4
 ; GFX7LESS-NEXT:  .LBB5_5:
@@ -3508,20 +3508,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[36:39], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7LESS-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_sub_f32_e32 v2, v3, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB5_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3962,20 +3962,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:  .LBB6_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    v_sub_f32_e32 v3, v4, v2
+; GFX7LESS-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB6_4
 ; GFX7LESS-NEXT:  .LBB6_5:
@@ -4334,20 +4334,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[36:39], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7LESS-DPP-NEXT:  .LBB6_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_sub_f32_e32 v2, v3, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB6_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4761,19 +4761,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
 ; GFX7LESS-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    v_sub_f32_e32 v3, v4, v2
+; GFX7LESS-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB7_2
 ; GFX7LESS-NEXT:  .LBB7_3:
@@ -5011,19 +5011,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
 ; GFX7LESS-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-DPP-NEXT:    v_sub_f32_e32 v3, v4, v2
+; GFX7LESS-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB7_2
 ; GFX7LESS-DPP-NEXT:  .LBB7_3:
@@ -5291,20 +5291,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:  .LBB8_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-NEXT:    v_sub_f32_e32 v3, v4, v2
+; GFX7LESS-NEXT:    v_sub_f32_e32 v0, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB8_4
 ; GFX7LESS-NEXT:  .LBB8_5:
@@ -5663,20 +5663,20 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[36:39], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[36:39], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7LESS-DPP-NEXT:  .LBB8_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_sub_f32_e32 v2, v3, v0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[1:2], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB8_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8332,23 +8332,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
 ; GFX7LESS-NEXT:    v_add_f64 v[0:1], s[6:7], v[0:1]
 ; GFX7LESS-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s9
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s8
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB11_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v0
-; GFX7LESS-NEXT:    v_add_f64 v[6:7], v[8:9], -v[4:5]
+; GFX7LESS-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v6
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v7
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v9
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB11_2
 ; GFX7LESS-NEXT:  .LBB11_3:
@@ -8589,23 +8589,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
 ; GFX7LESS-DPP-NEXT:    v_add_f64 v[0:1], s[6:7], v[0:1]
 ; GFX7LESS-DPP-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s8
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s9
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s8
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB11_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v9, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v8, v0
-; GFX7LESS-DPP-NEXT:    v_add_f64 v[6:7], v[8:9], -v[4:5]
+; GFX7LESS-DPP-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v6
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v7
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v8
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v9
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB11_2
 ; GFX7LESS-DPP-NEXT:  .LBB11_3:
@@ -8878,23 +8878,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7LESS-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:  .LBB12_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v0
-; GFX7LESS-NEXT:    v_add_f64 v[6:7], v[8:9], -v[4:5]
+; GFX7LESS-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v6
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v7
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v9
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB12_4
 ; GFX7LESS-NEXT:  .LBB12_5:
@@ -9268,23 +9268,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[4:5], off, s[36:39], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7LESS-DPP-NEXT:  .LBB12_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v9, v5
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_add_f64 v[4:5], v[6:7], -v[0:1]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v4
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v6
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, v7
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7]
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v6
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -9764,23 +9764,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX7LESS-NEXT:    v_add_f64 v[0:1], s[6:7], v[0:1]
 ; GFX7LESS-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s9
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s8
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB13_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v0
-; GFX7LESS-NEXT:    v_add_f64 v[6:7], v[8:9], -v[4:5]
+; GFX7LESS-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v6
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v7
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v9
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB13_2
 ; GFX7LESS-NEXT:  .LBB13_3:
@@ -10021,23 +10021,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX7LESS-DPP-NEXT:    v_add_f64 v[0:1], s[6:7], v[0:1]
 ; GFX7LESS-DPP-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s8
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s9
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s8
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v9, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v8, v0
-; GFX7LESS-DPP-NEXT:    v_add_f64 v[6:7], v[8:9], -v[4:5]
+; GFX7LESS-DPP-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, v6
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v7
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v8
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v9
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB13_2
 ; GFX7LESS-DPP-NEXT:  .LBB13_3:
@@ -10311,23 +10311,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7LESS-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:  .LBB14_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v0
-; GFX7LESS-NEXT:    v_add_f64 v[6:7], v[8:9], -v[4:5]
+; GFX7LESS-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v6
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v7
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v9
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB14_4
 ; GFX7LESS-NEXT:  .LBB14_5:
@@ -10701,23 +10701,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[4:5], off, s[36:39], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7LESS-DPP-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v9, v5
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_add_f64 v[4:5], v[6:7], -v[0:1]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v4
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v6
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, v7
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7]
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v6
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -11226,23 +11226,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7LESS-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7LESS-NEXT:  .LBB15_4: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v0
-; GFX7LESS-NEXT:    v_add_f64 v[6:7], v[8:9], -v[4:5]
+; GFX7LESS-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, v6
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v7
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v9
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
 ; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB15_4
 ; GFX7LESS-NEXT:  .LBB15_5:
@@ -11616,23 +11616,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[4:5], off, s[36:39], 0
 ; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX7LESS-DPP-NEXT:  .LBB15_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v9, v5
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_add_f64 v[4:5], v[6:7], -v[0:1]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v4
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v6
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, v7
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[2:5], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[6:7]
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
 ; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v6
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end

diff  --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index 2cf76554078a7..17a5f520ff41e 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -5863,6 +5863,12 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
 ; NOOPT-NEXT:    v_mov_b32_e32 v13, s13
 ; NOOPT-NEXT:    v_mov_b32_e32 v14, s14
 ; NOOPT-NEXT:    v_mov_b32_e32 v15, s15
+; NOOPT-NEXT:    s_mov_b64 s[0:1], exec
+; NOOPT-NEXT:    v_writelane_b32 v32, s0, 5
+; NOOPT-NEXT:    v_writelane_b32 v32, s1, 6
+; NOOPT-NEXT:    s_or_saveexec_b64 s[26:27], -1
+; NOOPT-NEXT:    buffer_store_dword v32, off, s[28:31], 0 ; 4-byte Folded Spill
+; NOOPT-NEXT:    s_mov_b64 exec, s[26:27]
 ; NOOPT-NEXT:    buffer_store_dword v0, off, s[28:31], 0 offset:4 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v1, off, s[28:31], 0 offset:8 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v2, off, s[28:31], 0 offset:12 ; 4-byte Folded Spill
@@ -5879,12 +5885,6 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
 ; NOOPT-NEXT:    buffer_store_dword v13, off, s[28:31], 0 offset:56 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v14, off, s[28:31], 0 offset:60 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v15, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill
-; NOOPT-NEXT:    s_mov_b64 s[0:1], exec
-; NOOPT-NEXT:    v_writelane_b32 v32, s0, 5
-; NOOPT-NEXT:    v_writelane_b32 v32, s1, 6
-; NOOPT-NEXT:    s_or_saveexec_b64 s[26:27], -1
-; NOOPT-NEXT:    buffer_store_dword v32, off, s[28:31], 0 ; 4-byte Folded Spill
-; NOOPT-NEXT:    s_mov_b64 exec, s[26:27]
 ; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
 ; NOOPT-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
 ; NOOPT-NEXT:    s_or_saveexec_b64 s[26:27], -1
@@ -5903,12 +5903,19 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
 ; NOOPT-NEXT:    buffer_load_dword v6, off, s[28:31], 0 offset:28 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v7, off, s[28:31], 0 offset:32 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v8, off, s[28:31], 0 offset:36 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_waitcnt expcnt(6)
 ; NOOPT-NEXT:    buffer_load_dword v9, off, s[28:31], 0 offset:40 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_waitcnt expcnt(5)
 ; NOOPT-NEXT:    buffer_load_dword v10, off, s[28:31], 0 offset:44 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_waitcnt expcnt(4)
 ; NOOPT-NEXT:    buffer_load_dword v11, off, s[28:31], 0 offset:48 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_waitcnt expcnt(3)
 ; NOOPT-NEXT:    buffer_load_dword v12, off, s[28:31], 0 offset:52 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_waitcnt expcnt(2)
 ; NOOPT-NEXT:    buffer_load_dword v13, off, s[28:31], 0 offset:56 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_waitcnt expcnt(1)
 ; NOOPT-NEXT:    buffer_load_dword v14, off, s[28:31], 0 offset:60 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_waitcnt expcnt(0)
 ; NOOPT-NEXT:    buffer_load_dword v15, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v16, off, s[28:31], 0 offset:72 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v17, off, s[28:31], 0 offset:80 ; 4-byte Folded Reload
@@ -8997,26 +9004,27 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; GENERIC-LABEL: broken_phi_bb:
 ; GENERIC:       ; %bb.0: ; %bb
 ; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GENERIC-NEXT:    s_mov_b32 s4, 8
+; GENERIC-NEXT:    s_mov_b32 s6, 8
 ; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
 ; GENERIC-NEXT:    s_mov_b32 s2, -1
 ; GENERIC-NEXT:    s_branch .LBB26_2
 ; GENERIC-NEXT:  .LBB26_1: ; %Flow
 ; GENERIC-NEXT:    ; in Loop: Header=BB26_2 Depth=1
 ; GENERIC-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GENERIC-NEXT:    s_mov_b32 s4, s1
 ; GENERIC-NEXT:    s_cbranch_vccz .LBB26_4
 ; GENERIC-NEXT:  .LBB26_2: ; %bb2
 ; GENERIC-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT:    s_cmp_ge_i32 s4, s0
+; GENERIC-NEXT:    s_cmp_ge_i32 s6, s0
 ; GENERIC-NEXT:    s_mov_b64 s[4:5], -1
+; GENERIC-NEXT:    ; implicit-def: $sgpr6
 ; GENERIC-NEXT:    s_cbranch_scc1 .LBB26_1
 ; GENERIC-NEXT:  ; %bb.3: ; %bb4
 ; GENERIC-NEXT:    ; in Loop: Header=BB26_2 Depth=1
 ; GENERIC-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
 ; GENERIC-NEXT:    s_mov_b64 s[4:5], 0
+; GENERIC-NEXT:    s_mov_b32 s6, s1
 ; GENERIC-NEXT:    s_branch .LBB26_1
 ; GENERIC-NEXT:  .LBB26_4: ; %bb8
 ; GENERIC-NEXT:    s_endpgm
@@ -9057,8 +9065,8 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_cmp_ge_i32_e64 s[2:3], v0, s2
 ; NOOPT-NEXT:    v_mov_b32_e32 v0, s4
-; NOOPT-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; NOOPT-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    v_writelane_b32 v18, s0, 2
 ; NOOPT-NEXT:    v_writelane_b32 v18, s1, 3
 ; NOOPT-NEXT:    s_or_saveexec_b64 s[20:21], -1
@@ -9104,30 +9112,30 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; NOOPT-NEXT:    v_mov_b32_e32 v13, s17
 ; NOOPT-NEXT:    v_mov_b32_e32 v14, s18
 ; NOOPT-NEXT:    v_mov_b32_e32 v15, s19
-; NOOPT-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:16 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v1, off, s[24:27], 0 offset:20 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v2, off, s[24:27], 0 offset:24 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v3, off, s[24:27], 0 offset:28 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v4, off, s[24:27], 0 offset:32 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v5, off, s[24:27], 0 offset:36 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v6, off, s[24:27], 0 offset:40 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v7, off, s[24:27], 0 offset:44 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v8, off, s[24:27], 0 offset:48 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v9, off, s[24:27], 0 offset:52 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v10, off, s[24:27], 0 offset:56 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v11, off, s[24:27], 0 offset:60 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v12, off, s[24:27], 0 offset:64 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v13, off, s[24:27], 0 offset:68 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v14, off, s[24:27], 0 offset:72 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v15, off, s[24:27], 0 offset:76 ; 4-byte Folded Spill
-; NOOPT-NEXT:    v_mov_b32_e32 v0, s0
-; NOOPT-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:12 ; 4-byte Folded Spill
+; NOOPT-NEXT:    v_mov_b32_e32 v16, s0
+; NOOPT-NEXT:    buffer_store_dword v16, off, s[24:27], 0 offset:76 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    s_mov_b64 s[0:1], exec
 ; NOOPT-NEXT:    v_writelane_b32 v18, s0, 4
 ; NOOPT-NEXT:    v_writelane_b32 v18, s1, 5
 ; NOOPT-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; NOOPT-NEXT:    buffer_store_dword v18, off, s[24:27], 0 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    s_mov_b64 exec, s[20:21]
+; NOOPT-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:12 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v1, off, s[24:27], 0 offset:16 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v2, off, s[24:27], 0 offset:20 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v3, off, s[24:27], 0 offset:24 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v4, off, s[24:27], 0 offset:28 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v5, off, s[24:27], 0 offset:32 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v6, off, s[24:27], 0 offset:36 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v7, off, s[24:27], 0 offset:40 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v8, off, s[24:27], 0 offset:44 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v9, off, s[24:27], 0 offset:48 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v10, off, s[24:27], 0 offset:52 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v11, off, s[24:27], 0 offset:56 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v12, off, s[24:27], 0 offset:60 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v13, off, s[24:27], 0 offset:64 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v14, off, s[24:27], 0 offset:68 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v15, off, s[24:27], 0 offset:72 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
 ; NOOPT-NEXT:  .LBB26_3: ; Parent Loop BB26_1 Depth=1
 ; NOOPT-NEXT:    ; => This Inner Loop Header: Depth=2
@@ -9138,23 +9146,30 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_readlane_b32 s0, v18, 6
 ; NOOPT-NEXT:    v_readlane_b32 s1, v18, 7
-; NOOPT-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:16 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v1, off, s[24:27], 0 offset:20 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v2, off, s[24:27], 0 offset:24 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v3, off, s[24:27], 0 offset:28 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v4, off, s[24:27], 0 offset:32 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v5, off, s[24:27], 0 offset:36 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v6, off, s[24:27], 0 offset:40 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v7, off, s[24:27], 0 offset:44 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v8, off, s[24:27], 0 offset:48 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v9, off, s[24:27], 0 offset:52 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v10, off, s[24:27], 0 offset:56 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v11, off, s[24:27], 0 offset:60 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v12, off, s[24:27], 0 offset:64 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v13, off, s[24:27], 0 offset:68 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v14, off, s[24:27], 0 offset:72 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v15, off, s[24:27], 0 offset:76 ; 4-byte Folded Reload
-; NOOPT-NEXT:    buffer_load_dword v16, off, s[24:27], 0 offset:12 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:12 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v1, off, s[24:27], 0 offset:16 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v2, off, s[24:27], 0 offset:20 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v3, off, s[24:27], 0 offset:24 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v4, off, s[24:27], 0 offset:28 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v5, off, s[24:27], 0 offset:32 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v6, off, s[24:27], 0 offset:36 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v7, off, s[24:27], 0 offset:40 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v8, off, s[24:27], 0 offset:44 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_waitcnt expcnt(6)
+; NOOPT-NEXT:    buffer_load_dword v9, off, s[24:27], 0 offset:48 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_waitcnt expcnt(5)
+; NOOPT-NEXT:    buffer_load_dword v10, off, s[24:27], 0 offset:52 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_waitcnt expcnt(4)
+; NOOPT-NEXT:    buffer_load_dword v11, off, s[24:27], 0 offset:56 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_waitcnt expcnt(3)
+; NOOPT-NEXT:    buffer_load_dword v12, off, s[24:27], 0 offset:60 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_waitcnt expcnt(2)
+; NOOPT-NEXT:    buffer_load_dword v13, off, s[24:27], 0 offset:64 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_waitcnt expcnt(1)
+; NOOPT-NEXT:    buffer_load_dword v14, off, s[24:27], 0 offset:68 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_waitcnt expcnt(0)
+; NOOPT-NEXT:    buffer_load_dword v15, off, s[24:27], 0 offset:72 ; 4-byte Folded Reload
+; NOOPT-NEXT:    buffer_load_dword v16, off, s[24:27], 0 offset:76 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v17, off, s[24:27], 0 offset:80 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_readfirstlane_b32 s2, v17
@@ -9178,22 +9193,22 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; NOOPT-NEXT:    buffer_store_dword v13, off, s[24:27], 0 offset:136 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v14, off, s[24:27], 0 offset:140 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    buffer_store_dword v15, off, s[24:27], 0 offset:144 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:16 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v1, off, s[24:27], 0 offset:20 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v2, off, s[24:27], 0 offset:24 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v3, off, s[24:27], 0 offset:28 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v4, off, s[24:27], 0 offset:32 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v5, off, s[24:27], 0 offset:36 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v6, off, s[24:27], 0 offset:40 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v7, off, s[24:27], 0 offset:44 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v8, off, s[24:27], 0 offset:48 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v9, off, s[24:27], 0 offset:52 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v10, off, s[24:27], 0 offset:56 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v11, off, s[24:27], 0 offset:60 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v12, off, s[24:27], 0 offset:64 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v13, off, s[24:27], 0 offset:68 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v14, off, s[24:27], 0 offset:72 ; 4-byte Folded Spill
-; NOOPT-NEXT:    buffer_store_dword v15, off, s[24:27], 0 offset:76 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:12 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v1, off, s[24:27], 0 offset:16 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v2, off, s[24:27], 0 offset:20 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v3, off, s[24:27], 0 offset:24 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v4, off, s[24:27], 0 offset:28 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v5, off, s[24:27], 0 offset:32 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v6, off, s[24:27], 0 offset:36 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v7, off, s[24:27], 0 offset:40 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v8, off, s[24:27], 0 offset:44 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v9, off, s[24:27], 0 offset:48 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v10, off, s[24:27], 0 offset:52 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v11, off, s[24:27], 0 offset:56 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v12, off, s[24:27], 0 offset:60 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v13, off, s[24:27], 0 offset:64 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v14, off, s[24:27], 0 offset:68 ; 4-byte Folded Spill
+; NOOPT-NEXT:    buffer_store_dword v15, off, s[24:27], 0 offset:72 ; 4-byte Folded Spill
 ; NOOPT-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; NOOPT-NEXT:    v_writelane_b32 v18, s2, 6
 ; NOOPT-NEXT:    v_writelane_b32 v18, s3, 7
@@ -9231,9 +9246,9 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; NOOPT-NEXT:    buffer_load_dword v13, off, s[24:27], 0 offset:136 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v14, off, s[24:27], 0 offset:140 ; 4-byte Folded Reload
 ; NOOPT-NEXT:    buffer_load_dword v15, off, s[24:27], 0 offset:144 ; 4-byte Folded Reload
+; NOOPT-NEXT:    s_mov_b64 s[0:1], 0
 ; NOOPT-NEXT:    s_waitcnt vmcnt(14)
 ; NOOPT-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Spill
-; NOOPT-NEXT:    s_mov_b64 s[0:1], 0
 ; NOOPT-NEXT:    v_writelane_b32 v18, s0, 2
 ; NOOPT-NEXT:    v_writelane_b32 v18, s1, 3
 ; NOOPT-NEXT:    s_or_saveexec_b64 s[20:21], -1

diff  --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll
index 044fb2abd5cc1..fab5d386446d3 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll
@@ -28,29 +28,29 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1
 ; GCN-NEXT:  ; %bb.1: ; %atomic
 ; GCN-NEXT:    s_mov_b32 s8, s10
 ; GCN-NEXT:    s_mov_b32 s9, s10
-; GCN-NEXT:    buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:400
+; GCN-NEXT:    buffer_load_dword v4, v[1:2], s[8:11], 0 addr64 offset:400
 ; GCN-NEXT:    s_load_dword s4, s[4:5], 0xf
 ; GCN-NEXT:    s_mov_b64 s[2:3], 0
 ; GCN-NEXT:  .LBB0_2: ; %atomicrmw.start
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_max_i32_e32 v3, s4, v4
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v6, v4
 ; GCN-NEXT:    v_mov_b32_e32 v5, v3
-; GCN-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_max_i32_e32 v4, s4, v5
-; GCN-NEXT:    v_mov_b32_e32 v3, v4
-; GCN-NEXT:    v_mov_b32_e32 v4, v5
-; GCN-NEXT:    buffer_atomic_cmpswap v[3:4], v[1:2], s[8:11], 0 addr64 offset:400 glc
+; GCN-NEXT:    buffer_atomic_cmpswap v[5:6], v[1:2], s[8:11], 0 addr64 offset:400 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_wbinvl1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GCN-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GCN-NEXT:    v_mov_b32_e32 v4, v5
 ; GCN-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GCN-NEXT:    s_cbranch_execnz .LBB0_2
 ; GCN-NEXT:  ; %bb.3: ; %atomicrmw.end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], 0
 ; GCN-NEXT:  .LBB0_4: ; %exit
 ; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -87,22 +87,22 @@ define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrs
 ; GCN-NEXT:  ; %bb.1: ; %atomic
 ; GCN-NEXT:    s_mov_b32 s0, s2
 ; GCN-NEXT:    s_mov_b32 s1, s2
-; GCN-NEXT:    buffer_load_dword v3, v[1:2], s[0:3], 0 addr64 offset:400
+; GCN-NEXT:    buffer_load_dword v4, v[1:2], s[0:3], 0 addr64 offset:400
 ; GCN-NEXT:    s_load_dword s6, s[4:5], 0xf
 ; GCN-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-NEXT:  .LBB1_2: ; %atomicrmw.start
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_max_i32_e32 v3, s6, v4
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v6, v4
 ; GCN-NEXT:    v_mov_b32_e32 v5, v3
-; GCN-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_max_i32_e32 v4, s6, v5
-; GCN-NEXT:    v_mov_b32_e32 v3, v4
-; GCN-NEXT:    v_mov_b32_e32 v4, v5
-; GCN-NEXT:    buffer_atomic_cmpswap v[3:4], v[1:2], s[0:3], 0 addr64 offset:400 glc
+; GCN-NEXT:    buffer_atomic_cmpswap v[5:6], v[1:2], s[0:3], 0 addr64 offset:400 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_wbinvl1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; GCN-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v4, v5
 ; GCN-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_cbranch_execnz .LBB1_2
 ; GCN-NEXT:  .LBB1_3: ; %exit

diff  --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index b1e6f47fdf2c7..b5e7589cbd134 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -2556,32 +2556,32 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; EG-NEXT:    Fetch clause starting at 12:
 ; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
 ; EG-NEXT:    ALU clause starting at 14:
-; EG-NEXT:     OR_INT T1.W, KC0[2].W, KC0[3].X,
-; EG-NEXT:     MOV * T0.W, literal.x,
+; EG-NEXT:     OR_INT T0.W, KC0[2].W, KC0[3].X,
+; EG-NEXT:     MOV * T1.W, literal.x,
 ; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
-; EG-NEXT:     SETNE_INT * T1.W, PV.W, 0.0,
+; EG-NEXT:     SETNE_INT * T0.W, PV.W, 0.0,
 ; EG-NEXT:     PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
 ; EG-NEXT:    ALU clause starting at 19:
-; EG-NEXT:     MOV T1.W, KC0[2].W,
-; EG-NEXT:     MOV * T0.W, KC0[3].Z,
+; EG-NEXT:     MOV T0.W, KC0[2].W,
+; EG-NEXT:     MOV * T1.W, KC0[3].Z,
 ; EG-NEXT:     MOV T2.W, KC0[3].Y,
 ; EG-NEXT:     MULLO_INT * T0.X, PV.W, PS,
-; EG-NEXT:     MOV T0.W, KC0[3].X,
-; EG-NEXT:     MULHI * T0.Y, T1.W, PV.W,
+; EG-NEXT:     MOV T1.W, KC0[3].X,
+; EG-NEXT:     MULHI * T0.Y, T0.W, PV.W,
 ; EG-NEXT:     ADD_INT T3.W, PS, T0.X,
 ; EG-NEXT:     MULLO_INT * T0.X, PV.W, T2.W,
 ; EG-NEXT:     ADD_INT T0.Y, PV.W, PS,
-; EG-NEXT:     MOV T0.W, literal.x,
-; EG-NEXT:     MULLO_INT * T0.X, T1.W, T2.W,
+; EG-NEXT:     MOV T1.W, literal.x,
+; EG-NEXT:     MULLO_INT * T0.X, T0.W, T2.W,
 ; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
 ; EG-NEXT:    ALU clause starting at 31:
-; EG-NEXT:     MOV T1.W, KC0[2].Y,
-; EG-NEXT:     SETE_INT * T0.W, T0.W, 0.0,
+; EG-NEXT:     MOV T0.W, KC0[2].Y,
+; EG-NEXT:     SETE_INT * T1.W, T1.W, 0.0,
 ; EG-NEXT:     PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
 ; EG-NEXT:    ALU clause starting at 34:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 35:
-; EG-NEXT:     LSHR * T1.X, T1.W, literal.x,
+; EG-NEXT:     LSHR * T1.X, T0.W, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
   %0 = icmp eq i64 %a, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index ab4e7e50539f6..ba9dd8f7c2468 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -240,31 +240,27 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0:       ; %bb.0: ; %_udiv-special-cases
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v2
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v2
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v1
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
@@ -274,34 +270,34 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 63
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v11
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v12
-; GFX9-O0-NEXT:    v_ashrrev_i64 v[3:4], s4, v[3:4]
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v11
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v12
+; GFX9-O0-NEXT:    v_ashrrev_i64 v[2:3], s4, v[2:3]
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v12
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 0
-; GFX9-O0-NEXT:    ; implicit-def: $vgpr29 : SGPR spill to VGPR lane
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s6, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s7, 1
+; GFX9-O0-NEXT:    ; implicit-def: $vgpr30 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s6, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s7, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s10, s6
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s10, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s10, 2
 ; GFX9-O0-NEXT:    s_mov_b32 s11, s7
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s11, 3
-; GFX9-O0-NEXT:    v_sub_co_u32_e32 v9, vcc, s10, v0
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s11, 3
+; GFX9-O0-NEXT:    v_sub_co_u32_e32 v9, vcc, s10, v2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s11
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v5, vcc, v4, v1, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v5, vcc, v4, v3, vcc
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s10
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v2, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v0, vcc
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s11
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v3, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v1, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
@@ -309,25 +305,25 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v10
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX9-O0-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[11:12], s[4:5]
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[4:5]
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[4:5]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[4:5]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v3
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[4:5]
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[4:5]
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[4:5]
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v8
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v17
@@ -466,18 +462,18 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v6
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[8:9], v[4:5], s[8:9]
-; GFX9-O0-NEXT:    v_ffbh_u32_e64 v4, v2
+; GFX9-O0-NEXT:    v_ffbh_u32_e64 v4, v0
 ; GFX9-O0-NEXT:    v_add_u32_e64 v4, v4, s13
-; GFX9-O0-NEXT:    v_ffbh_u32_e64 v5, v3
+; GFX9-O0-NEXT:    v_ffbh_u32_e64 v5, v1
 ; GFX9-O0-NEXT:    v_min_u32_e64 v5, v4, v5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr16
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
-; GFX9-O0-NEXT:    v_ffbh_u32_e64 v4, v0
+; GFX9-O0-NEXT:    v_ffbh_u32_e64 v4, v2
 ; GFX9-O0-NEXT:    v_add_u32_e64 v4, v4, s13
-; GFX9-O0-NEXT:    v_ffbh_u32_e64 v10, v1
+; GFX9-O0-NEXT:    v_ffbh_u32_e64 v10, v3
 ; GFX9-O0-NEXT:    v_min_u32_e64 v11, v4, v10
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr13
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s12
@@ -564,33 +560,33 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v6
 ; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s11
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v3, v4, s[8:9]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s10
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[8:9]
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
-; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s11
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v1, v4, s[8:9]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s10
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[8:9]
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s11
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v3, v4, s[8:9]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[8:9]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GFX9-O0-NEXT:    s_and_b64 s[6:7], s[4:5], s[6:7]
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_and_b64 s[6:7], s[4:5], s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s4, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s5, 5
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s4, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s5, 5
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
@@ -598,64 +594,67 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_branch .LBB0_8
 ; GFX9-O0-NEXT:  .LBB0_1: ; %Flow
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v29, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v29, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v30, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v30, 7
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-O0-NEXT:  ; %bb.2: ; %Flow
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_5
 ; GFX9-O0-NEXT:  .LBB0_3: ; %Flow2
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v29, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v29, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v30, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v30, 5
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_9
 ; GFX9-O0-NEXT:  .LBB0_4: ; %udiv-loop-exit
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 1
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[2:3], s4, v[0:1]
@@ -680,408 +679,408 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_or_b32_e64 v2, v2, v3
 ; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v4
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_3
 ; GFX9-O0-NEXT:  .LBB0_5: ; %Flow1
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v29, 8
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v29, 9
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v30, 8
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v30, 9
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_4
 ; GFX9-O0-NEXT:  .LBB0_6: ; %udiv-do-while
 ; GFX9-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s6, v29, 10
-; GFX9-O0-NEXT:    v_readlane_b32 s7, v29, 11
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_readlane_b32 s6, v30, 10
+; GFX9-O0-NEXT:    v_readlane_b32 s7, v30, 11
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 63
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[6:7], s4, v[2:3]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v7
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[28:29], s4, v[2:3]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v29
 ; GFX9-O0-NEXT:    s_mov_b32 s5, 1
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[26:27], s5, v[26:27]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v27
-; GFX9-O0-NEXT:    v_or_b32_e64 v14, v14, v15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v26
-; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v7
-; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v14
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[26:27], s5, v[2:3]
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[14:15], s4, v[10:11]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v27
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v15
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[22:23], s5, v[22:23]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v23
+; GFX9-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v28
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v22
+; GFX9-O0-NEXT:    v_or_b32_e64 v22, v5, v10
+; GFX9-O0-NEXT:    ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v4
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[28:29], s5, v[2:3]
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[4:5], s4, v[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v29
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX9-O0-NEXT:    v_or_b32_e64 v2, v2, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v26
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    v_or_b32_e64 v14, v3, v14
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v28
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_or_b32_e64 v4, v3, v4
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[2:3], s5, v[0:1]
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[26:27], s5, v[10:11]
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[28:29], s5, v[6:7]
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[0:1], s4, v[0:1]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v29
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v27
-; GFX9-O0-NEXT:    v_mov_b32_e32 v28, v25
-; GFX9-O0-NEXT:    v_or3_b32 v10, v10, v11, v28
+; GFX9-O0-NEXT:    v_or3_b32 v6, v6, v7, v10
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v26
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v24
-; GFX9-O0-NEXT:    v_or3_b32 v0, v0, v1, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v28
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v26
+; GFX9-O0-NEXT:    v_or3_b32 v0, v0, v1, v7
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v17
-; GFX9-O0-NEXT:    v_or_b32_e64 v10, v10, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v25
+; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v24
 ; GFX9-O0-NEXT:    v_or_b32_e64 v2, v2, v3
 ; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v14
-; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v7
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v4
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v22
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v23
-; GFX9-O0-NEXT:    v_sub_co_u32_e32 v7, vcc, v7, v10
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v16, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v14, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v6, vcc, v5, v11, vcc
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v15
+; GFX9-O0-NEXT:    v_sub_co_u32_e32 v13, vcc, v13, v6
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v11, vcc, v11, v4, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v6
-; GFX9-O0-NEXT:    v_ashrrev_i64 v[5:6], s4, v[4:5]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v6
+; GFX9-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v7
+; GFX9-O0-NEXT:    v_ashrrev_i64 v[13:14], s4, v[11:12]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v14
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s8, s5
-; GFX9-O0-NEXT:    v_and_b32_e64 v4, v15, s8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v5
+; GFX9-O0-NEXT:    v_and_b32_e64 v12, v7, s8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v13
 ; GFX9-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
-; GFX9-O0-NEXT:    v_and_b32_e64 v6, v17, s4
-; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-O0-NEXT:    v_and_b32_e64 v14, v11, s4
+; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, 0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v21
-; GFX9-O0-NEXT:    v_and_b32_e64 v22, v15, v22
-; GFX9-O0-NEXT:    v_and_b32_e64 v20, v17, v20
+; GFX9-O0-NEXT:    v_and_b32_e64 v22, v7, v22
+; GFX9-O0-NEXT:    v_and_b32_e64 v20, v11, v20
 ; GFX9-O0-NEXT:    ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v22
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v19
-; GFX9-O0-NEXT:    v_and_b32_e64 v15, v15, v22
-; GFX9-O0-NEXT:    v_and_b32_e64 v17, v17, v18
-; GFX9-O0-NEXT:    ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v17
-; GFX9-O0-NEXT:    ; kill: def $vgpr18 killed $vgpr18 killed $vgpr17_vgpr18 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v20
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v21
-; GFX9-O0-NEXT:    v_sub_co_u32_e32 v10, vcc, v10, v19
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v16, vcc, v16, v18, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v14, vcc, v14, v17, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v11, vcc, v11, v15, vcc
+; GFX9-O0-NEXT:    v_and_b32_e64 v7, v7, v22
+; GFX9-O0-NEXT:    v_and_b32_e64 v22, v11, v18
+; GFX9-O0-NEXT:    ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v22
+; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v23
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v20
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v21
+; GFX9-O0-NEXT:    v_sub_co_u32_e32 v6, vcc, v6, v19
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v10, vcc, v10, v18, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v11, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v5, v7, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v11
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v16
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v10
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], -1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s4, s9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s5
-; GFX9-O0-NEXT:    v_add_co_u32_e32 v11, vcc, v11, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s4
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v12, vcc
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s5
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v8, vcc, v8, v12, vcc
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s4
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v10, vcc, v10, v12, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v17
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, s5
+; GFX9-O0-NEXT:    v_add_co_u32_e32 v19, vcc, v11, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s4
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v11, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s5
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v16, vcc, v10, v11, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, s4
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v8, vcc, v8, v10, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v9
+; GFX9-O0-NEXT:    ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v20, v9
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v8
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v11
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v12
-; GFX9-O0-NEXT:    v_or_b32_e64 v10, v10, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v11
-; GFX9-O0-NEXT:    v_or_b32_e64 v8, v8, v9
-; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v10
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[8:9], v[4:5]
+; GFX9-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v17
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v19
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v20
+; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v17
+; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v20
+; GFX9-O0-NEXT:    v_or_b32_e64 v18, v18, v21
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v19
+; GFX9-O0-NEXT:    v_or_b32_e64 v16, v16, v17
+; GFX9-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v18
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[16:17], v[12:13]
 ; GFX9-O0-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v2
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v0
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v4
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v2
+; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v0
+; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v15
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v14
+; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v13
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v12
+; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s6, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s7, 7
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s6, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s7, 7
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s6, 10
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s7, 11
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s6, 10
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s7, 11
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-O0-NEXT:    s_cbranch_execnz .LBB0_6
 ; GFX9-O0-NEXT:    s_branch .LBB0_1
 ; GFX9-O0-NEXT:  .LBB0_7: ; %udiv-preheader
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(9)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v10
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[14:15], v8, v[18:19]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v15
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[6:7], v4, v[18:19]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 64
-; GFX9-O0-NEXT:    v_sub_u32_e64 v20, s4, v8
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[20:21], v20, v[16:17]
+; GFX9-O0-NEXT:    v_sub_u32_e64 v20, s4, v4
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v21
-; GFX9-O0-NEXT:    v_or_b32_e64 v11, v11, v22
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v20
-; GFX9-O0-NEXT:    v_or_b32_e64 v14, v14, v15
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v11
-; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v15
-; GFX9-O0-NEXT:    v_cmp_lt_u32_e64 s[6:7], v8, s4
-; GFX9-O0-NEXT:    v_sub_u32_e64 v11, v8, s4
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[20:21], v11, v[16:17]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v21
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v11, v11, v22, s[6:7]
+; GFX9-O0-NEXT:    v_or_b32_e64 v5, v5, v22
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v20
+; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v7
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v7
+; GFX9-O0-NEXT:    v_cmp_lt_u32_e64 s[6:7], v4, s4
+; GFX9-O0-NEXT:    v_sub_u32_e64 v5, v4, s4
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[20:21], v5, v[14:15]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v21
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v22, s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 0
-; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, s4
+; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[4:5], v4, s4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v19
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v11, v11, v22, s[4:5]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v14
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v20
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v18
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[4:5]
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v22, s[4:5]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v20
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v18
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[4:5]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v11
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[16:17], v8, v[16:17]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v17
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[4:5], v4, v[14:15]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v5
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-O0-NEXT:    s_mov_b32 s8, s5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s8
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v8, v8, v11, s[6:7]
-; GFX9-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 killed $vgpr16_vgpr17 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, s8
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX9-O0-NEXT:    s_mov_b32 s8, s4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s8
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v16, v11, v16, s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s8
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[6:7]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
-; GFX9-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v8
-; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v10
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v13
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], -1
 ; GFX9-O0-NEXT:    s_mov_b32 s7, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s6, s9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s7
-; GFX9-O0-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s6
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v12, vcc, v11, v12, vcc
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s7
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v10, vcc, v10, v11, vcc
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s6
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v11, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v17
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, s7
+; GFX9-O0-NEXT:    v_add_co_u32_e32 v16, vcc, v15, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, s6
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v12, vcc, v12, v15, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, s7
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v18, vcc, v14, v15, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, s6
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v13, vcc, v13, v14, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
-; GFX9-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v9
+; GFX9-O0-NEXT:    ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v13
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
-; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s4
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v12
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s4
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, s5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, s4
+; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s4, 10
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s5, 11
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s4, 10
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s5, 11
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_6
 ; GFX9-O0-NEXT:  .LBB0_8: ; %udiv-bb1
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -1094,118 +1093,118 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX9-O0-NEXT:    s_mov_b32 s8, s6
 ; GFX9-O0-NEXT:    s_mov_b32 s9, s7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s5
-; GFX9-O0-NEXT:    v_add_co_u32_e32 v7, vcc, v3, v4
+; GFX9-O0-NEXT:    v_add_co_u32_e32 v8, vcc, v3, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s8
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v4, vcc, v2, v4, vcc
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s9
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v0, vcc, v0, v2, vcc
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s9
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v1
+; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v1
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v8
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 0x7f
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v2, s4, v3
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[0:1], v2, v[11:12]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v1
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[4:5], v2, v[10:11]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v5
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 64
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v13, s4, v2
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[13:14], v13, v[9:10]
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[13:14], v13, v[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v14
-; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v15
-; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v13
-; GFX9-O0-NEXT:    v_or_b32_e64 v0, v0, v1
-; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v1
+; GFX9-O0-NEXT:    v_or_b32_e64 v12, v12, v15
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v13
+; GFX9-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v5
 ; GFX9-O0-NEXT:    v_cmp_lt_u32_e64 s[4:5], v2, s4
 ; GFX9-O0-NEXT:    s_mov_b32 s10, 63
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v3, s10, v3
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[13:14], v3, v[9:10]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v14
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[4:5]
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[12:13], v3, v[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v13
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[4:5]
 ; GFX9-O0-NEXT:    s_mov_b32 s10, 0
 ; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[10:11], v2, s10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v12
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[10:11]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v13
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v11
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[10:11]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v11
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[10:11]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v12
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[4:5]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v10
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[10:11]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr10
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr10
-; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[2:3], v2, v[9:10]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s9
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[4:5]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[6:7], v2, v[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[4:5]
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v3, v6, s[4:5]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v6
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v2
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v8
-; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v7
-; GFX9-O0-NEXT:    v_or_b32_e64 v4, v4, v5
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v6
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[4:5], v[4:5], s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, s7
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v9
+; GFX9-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v8
+; GFX9-O0-NEXT:    v_or_b32_e64 v0, v0, v1
+; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[4:5], v[0:1], s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, s7
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9-O0-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GFX9-O0-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s6, 8
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s7, 9
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s6, 8
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s7, 9
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
-; GFX9-O0-NEXT:    buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT:    s_cbranch_execz .LBB0_5
@@ -1221,10 +1220,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 32
@@ -1498,7 +1497,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[3:4], s4, v[3:4]
 ; GFX9-O0-NEXT:    ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
@@ -1712,7 +1711,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0:       ; %bb.0: ; %_udiv-special-cases
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v6
 ; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
@@ -1789,16 +1788,16 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 0
-; GFX9-O0-NEXT:    ; implicit-def: $vgpr29 : SGPR spill to VGPR lane
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s6, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s7, 1
+; GFX9-O0-NEXT:    ; implicit-def: $vgpr30 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s6, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s7, 1
 ; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[0:1], s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v9
-; GFX9-O0-NEXT:    v_or_b32_e64 v14, v1, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v8
-; GFX9-O0-NEXT:    v_or_b32_e64 v8, v0, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v13
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v9
+; GFX9-O0-NEXT:    v_or_b32_e64 v14, v3, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v8
+; GFX9-O0-NEXT:    v_or_b32_e64 v8, v2, v0
 ; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v14
 ; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7]
@@ -1843,18 +1842,18 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT:    v_ffbh_u32_e64 v4, v2
+; GFX9-O0-NEXT:    v_ffbh_u32_e64 v4, v0
 ; GFX9-O0-NEXT:    v_add_u32_e64 v4, v4, s9
-; GFX9-O0-NEXT:    v_ffbh_u32_e64 v5, v3
+; GFX9-O0-NEXT:    v_ffbh_u32_e64 v5, v1
 ; GFX9-O0-NEXT:    v_min_u32_e64 v5, v4, v5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
-; GFX9-O0-NEXT:    v_ffbh_u32_e64 v4, v0
+; GFX9-O0-NEXT:    v_ffbh_u32_e64 v4, v2
 ; GFX9-O0-NEXT:    v_add_u32_e64 v4, v4, s9
-; GFX9-O0-NEXT:    v_ffbh_u32_e64 v10, v1
+; GFX9-O0-NEXT:    v_ffbh_u32_e64 v10, v3
 ; GFX9-O0-NEXT:    v_min_u32_e64 v14, v4, v10
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s8
@@ -1945,33 +1944,33 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v6
 ; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s11
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v3, v4, s[8:9]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s10
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[8:9]
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
-; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s11
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v1, v4, s[8:9]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s10
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[8:9]
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s11
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v3, v4, s[8:9]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[8:9]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GFX9-O0-NEXT:    s_and_b64 s[6:7], s[4:5], s[6:7]
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_and_b64 s[6:7], s[4:5], s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s4, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s5, 3
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s4, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s5, 3
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1979,47 +1978,50 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_branch .LBB1_8
 ; GFX9-O0-NEXT:  .LBB1_1: ; %Flow
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v29, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v29, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v30, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v30, 5
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-O0-NEXT:  ; %bb.2: ; %Flow
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB1_5
 ; GFX9-O0-NEXT:  .LBB1_3: ; %Flow2
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v29, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v29, 3
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v30, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v30, 3
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
@@ -2061,29 +2063,29 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_or_b32_e64 v2, v2, v3
 ; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v4
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB1_3
 ; GFX9-O0-NEXT:  .LBB1_5: ; %Flow1
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v29, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v29, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v30, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v30, 7
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
@@ -2101,214 +2103,214 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:  .LBB1_6: ; %udiv-do-while
 ; GFX9-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s6, v29, 8
-; GFX9-O0-NEXT:    v_readlane_b32 s7, v29, 9
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_readlane_b32 s6, v30, 8
+; GFX9-O0-NEXT:    v_readlane_b32 s7, v30, 9
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 63
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[6:7], s4, v[2:3]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v7
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[28:29], s4, v[2:3]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v29
 ; GFX9-O0-NEXT:    s_mov_b32 s5, 1
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[26:27], s5, v[26:27]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v27
-; GFX9-O0-NEXT:    v_or_b32_e64 v14, v14, v15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v26
-; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v7
-; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v14
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[26:27], s5, v[2:3]
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[14:15], s4, v[10:11]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v27
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v15
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[22:23], s5, v[22:23]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v23
+; GFX9-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v28
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v22
+; GFX9-O0-NEXT:    v_or_b32_e64 v22, v5, v10
+; GFX9-O0-NEXT:    ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v4
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[28:29], s5, v[2:3]
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[4:5], s4, v[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v29
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX9-O0-NEXT:    v_or_b32_e64 v2, v2, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v26
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    v_or_b32_e64 v14, v3, v14
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v28
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_or_b32_e64 v4, v3, v4
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[2:3], s5, v[0:1]
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[26:27], s5, v[10:11]
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[28:29], s5, v[6:7]
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[0:1], s4, v[0:1]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v29
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(10)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v27
-; GFX9-O0-NEXT:    v_mov_b32_e32 v28, v25
-; GFX9-O0-NEXT:    v_or3_b32 v10, v10, v11, v28
+; GFX9-O0-NEXT:    v_or3_b32 v6, v6, v7, v10
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v26
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v24
-; GFX9-O0-NEXT:    v_or3_b32 v0, v0, v1, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v28
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v26
+; GFX9-O0-NEXT:    v_or3_b32 v0, v0, v1, v7
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v17
-; GFX9-O0-NEXT:    v_or_b32_e64 v10, v10, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v25
+; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v24
 ; GFX9-O0-NEXT:    v_or_b32_e64 v2, v2, v3
 ; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v14
-; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v7
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v4
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v22
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v23
-; GFX9-O0-NEXT:    v_sub_co_u32_e32 v7, vcc, v7, v10
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v16, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v14, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v6, vcc, v5, v11, vcc
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v15
+; GFX9-O0-NEXT:    v_sub_co_u32_e32 v13, vcc, v13, v6
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v11, vcc, v11, v4, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v6
-; GFX9-O0-NEXT:    v_ashrrev_i64 v[5:6], s4, v[4:5]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v6
+; GFX9-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v7
+; GFX9-O0-NEXT:    v_ashrrev_i64 v[13:14], s4, v[11:12]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v14
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s8, s5
-; GFX9-O0-NEXT:    v_and_b32_e64 v4, v15, s8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v5
+; GFX9-O0-NEXT:    v_and_b32_e64 v12, v7, s8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v13
 ; GFX9-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
-; GFX9-O0-NEXT:    v_and_b32_e64 v6, v17, s4
-; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-O0-NEXT:    v_and_b32_e64 v14, v11, s4
+; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, 0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v21
-; GFX9-O0-NEXT:    v_and_b32_e64 v22, v15, v22
-; GFX9-O0-NEXT:    v_and_b32_e64 v20, v17, v20
+; GFX9-O0-NEXT:    v_and_b32_e64 v22, v7, v22
+; GFX9-O0-NEXT:    v_and_b32_e64 v20, v11, v20
 ; GFX9-O0-NEXT:    ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v22
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v19
-; GFX9-O0-NEXT:    v_and_b32_e64 v15, v15, v22
-; GFX9-O0-NEXT:    v_and_b32_e64 v17, v17, v18
-; GFX9-O0-NEXT:    ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v17
-; GFX9-O0-NEXT:    ; kill: def $vgpr18 killed $vgpr18 killed $vgpr17_vgpr18 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v20
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v21
-; GFX9-O0-NEXT:    v_sub_co_u32_e32 v10, vcc, v10, v19
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v16, vcc, v16, v18, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v14, vcc, v14, v17, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v11, vcc, v11, v15, vcc
+; GFX9-O0-NEXT:    v_and_b32_e64 v7, v7, v22
+; GFX9-O0-NEXT:    v_and_b32_e64 v22, v11, v18
+; GFX9-O0-NEXT:    ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v22
+; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v23
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v20
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v21
+; GFX9-O0-NEXT:    v_sub_co_u32_e32 v6, vcc, v6, v19
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v10, vcc, v10, v18, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v11, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v5, v7, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v11
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v16
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v10
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], -1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s4, s9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s5
-; GFX9-O0-NEXT:    v_add_co_u32_e32 v11, vcc, v11, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s4
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v12, vcc
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s5
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v8, vcc, v8, v12, vcc
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s4
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v10, vcc, v10, v12, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v17
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, s5
+; GFX9-O0-NEXT:    v_add_co_u32_e32 v19, vcc, v11, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s4
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v11, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s5
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v16, vcc, v10, v11, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, s4
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v8, vcc, v8, v10, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v9
+; GFX9-O0-NEXT:    ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v20, v9
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v8
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v11
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v12
-; GFX9-O0-NEXT:    v_or_b32_e64 v10, v10, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v11
-; GFX9-O0-NEXT:    v_or_b32_e64 v8, v8, v9
-; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v10
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[8:9], v[4:5]
+; GFX9-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v17
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v19
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v20
+; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v17
+; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v20
+; GFX9-O0-NEXT:    v_or_b32_e64 v18, v18, v21
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v19
+; GFX9-O0-NEXT:    v_or_b32_e64 v16, v16, v17
+; GFX9-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v18
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[16:17], v[12:13]
 ; GFX9-O0-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v2
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v0
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v4
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v2
+; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v0
+; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v15
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v14
+; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v13
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v12
+; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s6, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s7, 5
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s6, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s7, 5
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s6, 8
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s7, 9
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s6, 8
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s7, 9
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
@@ -2326,128 +2328,128 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_branch .LBB1_1
 ; GFX9-O0-NEXT:  .LBB1_7: ; %udiv-preheader
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(9)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v10
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[14:15], v8, v[18:19]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v15
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[6:7], v4, v[18:19]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 64
-; GFX9-O0-NEXT:    v_sub_u32_e64 v20, s4, v8
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[20:21], v20, v[16:17]
+; GFX9-O0-NEXT:    v_sub_u32_e64 v20, s4, v4
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v21
-; GFX9-O0-NEXT:    v_or_b32_e64 v11, v11, v22
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v20
-; GFX9-O0-NEXT:    v_or_b32_e64 v14, v14, v15
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v11
-; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v15
-; GFX9-O0-NEXT:    v_cmp_lt_u32_e64 s[6:7], v8, s4
-; GFX9-O0-NEXT:    v_sub_u32_e64 v11, v8, s4
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[20:21], v11, v[16:17]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v21
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v11, v11, v22, s[6:7]
+; GFX9-O0-NEXT:    v_or_b32_e64 v5, v5, v22
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v20
+; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v7
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v7
+; GFX9-O0-NEXT:    v_cmp_lt_u32_e64 s[6:7], v4, s4
+; GFX9-O0-NEXT:    v_sub_u32_e64 v5, v4, s4
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[20:21], v5, v[14:15]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v21
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v22, s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 0
-; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, s4
+; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[4:5], v4, s4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v19
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v11, v11, v22, s[4:5]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v14
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v20
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v18
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[4:5]
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v22, s[4:5]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v20
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v18
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[4:5]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v11
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[16:17], v8, v[16:17]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v17
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[4:5], v4, v[14:15]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v5
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-O0-NEXT:    s_mov_b32 s8, s5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s8
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v8, v8, v11, s[6:7]
-; GFX9-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 killed $vgpr16_vgpr17 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, s8
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX9-O0-NEXT:    s_mov_b32 s8, s4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s8
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v16, v11, v16, s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s8
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[6:7]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
-; GFX9-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v8
-; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v10
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v13
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], -1
 ; GFX9-O0-NEXT:    s_mov_b32 s7, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s6, s9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s7
-; GFX9-O0-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s6
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v12, vcc, v11, v12, vcc
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s7
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v10, vcc, v10, v11, vcc
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s6
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v11, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v17
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, s7
+; GFX9-O0-NEXT:    v_add_co_u32_e32 v16, vcc, v15, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, s6
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v12, vcc, v12, v15, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, s7
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v18, vcc, v14, v15, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, s6
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v13, vcc, v13, v14, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
-; GFX9-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v9
+; GFX9-O0-NEXT:    ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v13
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
-; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s4
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v12
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s4
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, s5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, s4
+; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s4, 8
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s5, 9
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s4, 8
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s5, 9
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
@@ -2457,12 +2459,12 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_branch .LBB1_6
 ; GFX9-O0-NEXT:  .LBB1_8: ; %udiv-bb1
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -2475,118 +2477,118 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX9-O0-NEXT:    s_mov_b32 s8, s6
 ; GFX9-O0-NEXT:    s_mov_b32 s9, s7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s5
-; GFX9-O0-NEXT:    v_add_co_u32_e32 v7, vcc, v3, v4
+; GFX9-O0-NEXT:    v_add_co_u32_e32 v8, vcc, v3, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s8
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v4, vcc, v2, v4, vcc
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s9
-; GFX9-O0-NEXT:    v_addc_co_u32_e32 v0, vcc, v0, v2, vcc
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s9
+; GFX9-O0-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v1
+; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v1
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v8
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 0x7f
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v2, s4, v3
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[0:1], v2, v[11:12]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v1
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[4:5], v2, v[10:11]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v5
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 64
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v13, s4, v2
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[13:14], v13, v[9:10]
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[13:14], v13, v[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v14
-; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v15
-; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v13
-; GFX9-O0-NEXT:    v_or_b32_e64 v0, v0, v1
-; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v1
+; GFX9-O0-NEXT:    v_or_b32_e64 v12, v12, v15
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v13
+; GFX9-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v5
 ; GFX9-O0-NEXT:    v_cmp_lt_u32_e64 s[4:5], v2, s4
 ; GFX9-O0-NEXT:    s_mov_b32 s10, 63
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v3, s10, v3
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[13:14], v3, v[9:10]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v14
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[4:5]
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[12:13], v3, v[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v13
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[4:5]
 ; GFX9-O0-NEXT:    s_mov_b32 s10, 0
 ; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[10:11], v2, s10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v12
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[10:11]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v13
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v11
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[10:11]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v11
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[10:11]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v12
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[4:5]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v10
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[10:11]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr10
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr10
-; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT:    v_lshlrev_b64 v[2:3], v2, v[9:10]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s9
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[4:5]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9-O0-NEXT:    v_lshlrev_b64 v[6:7], v2, v[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[4:5]
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v3, v6, s[4:5]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v6
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v2
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v8
-; GFX9-O0-NEXT:    v_or_b32_e64 v6, v6, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v7
-; GFX9-O0-NEXT:    v_or_b32_e64 v4, v4, v5
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v6
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[4:5], v[4:5], s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, s7
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, s7
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v9
+; GFX9-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v8
+; GFX9-O0-NEXT:    v_or_b32_e64 v0, v0, v1
+; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[4:5], v[0:1], s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9-O0-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GFX9-O0-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s6, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v29, s7, 7
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s6, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v30, s7, 7
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GFX9-O0-NEXT:    buffer_store_dword v29, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT:    s_cbranch_execz .LBB1_5
@@ -2843,7 +2845,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[3:4], s4, v[3:4]
 ; GFX9-O0-NEXT:    ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 0981584598abd..df496258a2509 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -369,12 +369,12 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GCN-IR-NEXT:    v_add_i32_e64 v2, s[6:7], 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v18, v2, v3
+; GCN-IR-NEXT:    v_min_u32_e32 v10, v2, v3
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v6
 ; GCN-IR-NEXT:    v_add_i32_e64 v2, s[6:7], 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v7
-; GCN-IR-NEXT:    v_min_u32_e32 v19, v2, v3
-; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[6:7], v18, v19
+; GCN-IR-NEXT:    v_min_u32_e32 v11, v2, v3
+; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[6:7], v10, v11
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[6:7]
 ; GCN-IR-NEXT:    v_subb_u32_e64 v3, s[6:7], 0, 0, s[6:7]
@@ -399,47 +399,47 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[6:7], v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB1_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v16, vcc, -1, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v17, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_not_b32_e32 v4, v18
-; GCN-IR-NEXT:    v_lshr_b64 v[10:11], v[6:7], v8
-; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, v4, v19
-; GCN-IR-NEXT:    v_addc_u32_e64 v7, s[8:9], -1, 0, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_not_b32_e32 v4, v10
+; GCN-IR-NEXT:    v_lshr_b64 v[8:9], v[6:7], v8
+; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, v4, v11
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_addc_u32_e64 v7, s[4:5], -1, 0, vcc
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:  .LBB1_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v4
-; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v16, v10
-; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v17, v11, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v4
+; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v16, v8
+; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v17, v9, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v4
 ; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 1, v6
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
+; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v10
+; GCN-IR-NEXT:    v_and_b32_e32 v11, v10, v1
+; GCN-IR-NEXT:    v_and_b32_e32 v10, v10, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v18, 31, v4
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v18
-; GCN-IR-NEXT:    v_and_b32_e32 v19, v18, v1
-; GCN-IR-NEXT:    v_and_b32_e32 v18, v18, v0
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v9, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v8, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, v5
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT:    v_sub_i32_e32 v10, vcc, v10, v18
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, v4
-; GCN-IR-NEXT:    v_subb_u32_e32 v11, vcc, v11, v19, vcc
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v10
+; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v5
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, v4
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz .LBB1_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:  .LBB1_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[2:3], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v5, v5, v1
 ; GCN-IR-NEXT:    v_or_b32_e32 v4, v4, v0
@@ -1420,9 +1420,9 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v8, v2, v3
+; GCN-IR-NEXT:    v_min_u32_e32 v10, v2, v3
 ; GCN-IR-NEXT:    s_movk_i32 s6, 0xffc5
-; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, s6, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, s6, v10
 ; GCN-IR-NEXT:    v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[2:3]
@@ -1444,46 +1444,46 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], 24, v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB11_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, -1, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_lshr_b64 v[10:11], 24, v6
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 58, v8
-; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[8:9], 0, 0, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_lshr_b64 v[8:9], 24, v6
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 58, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:  .LBB11_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v4
-; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v14, v10
-; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v15, v11, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v4
+; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v14, v8
+; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v15, v9, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v4
 ; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 1, v6
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
+; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v10
+; GCN-IR-NEXT:    v_and_b32_e32 v11, v10, v1
+; GCN-IR-NEXT:    v_and_b32_e32 v10, v10, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v16, 31, v4
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v16
-; GCN-IR-NEXT:    v_and_b32_e32 v17, v16, v1
-; GCN-IR-NEXT:    v_and_b32_e32 v16, v16, v0
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v9, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v8, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, v5
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT:    v_sub_i32_e32 v10, vcc, v10, v16
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, v4
-; GCN-IR-NEXT:    v_subb_u32_e32 v11, vcc, v11, v17, vcc
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v10
+; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v5
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, v4
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz .LBB11_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:  .LBB11_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[2:3], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v5, v5, v1
 ; GCN-IR-NEXT:    v_or_b32_e32 v4, v4, v0
@@ -1613,9 +1613,9 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v8, v2, v3
+; GCN-IR-NEXT:    v_min_u32_e32 v10, v2, v3
 ; GCN-IR-NEXT:    s_movk_i32 s6, 0xffd0
-; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, s6, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, s6, v10
 ; GCN-IR-NEXT:    v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[2:3]
@@ -1631,54 +1631,54 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB12_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
-; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v2
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
+; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
+; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
 ; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[8:9], v2
+; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[4:5], v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-IR-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB12_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, -1, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_lshr_b64 v[10:11], s[8:9], v6
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v8
-; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[8:9], 0, 0, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v6
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:  .LBB12_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v4
-; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v14, v10
-; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v15, v11, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v4
+; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v14, v8
+; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v15, v9, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v4
 ; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 1, v6
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
+; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v10
+; GCN-IR-NEXT:    v_and_b32_e32 v11, v10, v1
+; GCN-IR-NEXT:    v_and_b32_e32 v10, v10, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v16, 31, v4
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v16
-; GCN-IR-NEXT:    v_and_b32_e32 v17, v16, v1
-; GCN-IR-NEXT:    v_and_b32_e32 v16, v16, v0
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v9, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v8, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, v5
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT:    v_sub_i32_e32 v10, vcc, v10, v16
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, v4
-; GCN-IR-NEXT:    v_subb_u32_e32 v11, vcc, v11, v17, vcc
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v10
+; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v5
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, v4
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz .LBB12_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:  .LBB12_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[2:3], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v5, v5, v1
 ; GCN-IR-NEXT:    v_or_b32_e32 v4, v4, v0
@@ -1715,8 +1715,8 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v0, v4
 ; GCN-IR-NEXT:    v_add_i32_e64 v0, s[4:5], 32, v0
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v1, v5
-; GCN-IR-NEXT:    v_min_u32_e32 v12, v0, v1
-; GCN-IR-NEXT:    v_sub_i32_e64 v0, s[4:5], 48, v12
+; GCN-IR-NEXT:    v_min_u32_e32 v8, v0, v1
+; GCN-IR-NEXT:    v_sub_i32_e64 v0, s[4:5], 48, v8
 ; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[4:5]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[4:5], 63, v[0:1]
@@ -1738,44 +1738,44 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[4:5], v0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB13_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    v_lshr_b64 v[8:9], v[4:5], v6
-; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 0xffffffcf, v12
-; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[8:9], 0, -1, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT:    s_movk_i32 s10, 0x7fff
+; GCN-IR-NEXT:    v_lshr_b64 v[6:7], v[4:5], v6
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 0xffffffcf, v8
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[4:5], 0, -1, vcc
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    s_movk_i32 s12, 0x7fff
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-IR-NEXT:  .LBB13_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
-; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v2
-; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, s10, v8
-; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, 0, v9, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v2
+; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, s12, v6
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, 0, v7, vcc
 ; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 1, v4
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v8, v0
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
 ; GCN-IR-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
+; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v8, 0x8000, v8
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[4:5]
-; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v12
-; GCN-IR-NEXT:    v_and_b32_e32 v12, 0x8000, v12
-; GCN-IR-NEXT:    v_or_b32_e32 v1, v7, v1
-; GCN-IR-NEXT:    v_or_b32_e32 v0, v6, v0
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, v3
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, v8, v12
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, v2
-; GCN-IR-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v9, v1
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[4:5], v6, v8
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v3
+; GCN-IR-NEXT:    v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v2
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz .LBB13_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:  .LBB13_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v3, v3, v1
 ; GCN-IR-NEXT:    v_or_b32_e32 v2, v2, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index a5cb9a507bd89..47dfa9f4fc2d3 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -346,12 +346,12 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v2
 ; GCN-IR-NEXT:    v_add_i32_e64 v4, s[6:7], 32, v4
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v3
-; GCN-IR-NEXT:    v_min_u32_e32 v10, v4, v5
+; GCN-IR-NEXT:    v_min_u32_e32 v12, v4, v5
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v0
 ; GCN-IR-NEXT:    v_add_i32_e64 v4, s[6:7], 32, v4
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v11, v4, v5
-; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[6:7], v10, v11
+; GCN-IR-NEXT:    v_min_u32_e32 v13, v4, v5
+; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[6:7], v12, v13
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_subb_u32_e64 v5, s[6:7], 0, 0, s[6:7]
@@ -375,47 +375,47 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[0:1], v4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB1_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v16, vcc, -1, v2
 ; GCN-IR-NEXT:    v_addc_u32_e32 v17, vcc, -1, v3, vcc
-; GCN-IR-NEXT:    v_not_b32_e32 v6, v10
-; GCN-IR-NEXT:    v_lshr_b64 v[12:13], v[0:1], v8
-; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, v6, v11
-; GCN-IR-NEXT:    v_addc_u32_e64 v9, s[8:9], -1, 0, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT:    v_not_b32_e32 v6, v12
+; GCN-IR-NEXT:    v_lshr_b64 v[10:11], v[0:1], v8
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, v6, v13
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-IR-NEXT:    v_addc_u32_e64 v9, s[4:5], -1, 0, vcc
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:  .LBB1_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[12:13], v[12:13], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT:    v_or_b32_e32 v12, v12, v6
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v16, v12
-; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v17, v13, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v6
+; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v16, v10
+; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v17, v11, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v12, v4
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v6
 ; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v8
+; GCN-IR-NEXT:    v_or_b32_e32 v5, v13, v5
+; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v12
+; GCN-IR-NEXT:    v_and_b32_e32 v13, v12, v3
+; GCN-IR-NEXT:    v_and_b32_e32 v12, v12, v2
 ; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v18, 31, v6
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v18
-; GCN-IR-NEXT:    v_and_b32_e32 v19, v18, v3
-; GCN-IR-NEXT:    v_and_b32_e32 v18, v18, v2
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v11, v5
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v10, v4
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, v7
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT:    v_sub_i32_e32 v12, vcc, v12, v18
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, v6
-; GCN-IR-NEXT:    v_subb_u32_e32 v13, vcc, v13, v19, vcc
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v12
+; GCN-IR-NEXT:    v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v13, v7
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, v6
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz .LBB1_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:  .LBB1_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v7, v7, v5
 ; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v4
@@ -1538,9 +1538,9 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v8, v2, v3
+; GCN-IR-NEXT:    v_min_u32_e32 v10, v2, v3
 ; GCN-IR-NEXT:    s_movk_i32 s6, 0xffc5
-; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, s6, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, s6, v10
 ; GCN-IR-NEXT:    v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[2:3]
@@ -1561,46 +1561,46 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], 24, v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB11_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_lshr_b64 v[10:11], 24, v6
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 58, v8
-; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[8:9], 0, 0, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_lshr_b64 v[8:9], 24, v6
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 58, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:  .LBB11_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v4
-; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v12, v10
-; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v13, v11, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v4
+; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v12, v8
+; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v13, v9, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v4
 ; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 1, v6
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
+; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v10
+; GCN-IR-NEXT:    v_and_b32_e32 v11, v10, v1
+; GCN-IR-NEXT:    v_and_b32_e32 v10, v10, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v14, 31, v4
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v14
-; GCN-IR-NEXT:    v_and_b32_e32 v15, v14, v1
-; GCN-IR-NEXT:    v_and_b32_e32 v14, v14, v0
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v9, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v8, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, v5
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT:    v_sub_i32_e32 v10, vcc, v10, v14
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, v4
-; GCN-IR-NEXT:    v_subb_u32_e32 v11, vcc, v11, v15, vcc
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v10
+; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v5
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, v4
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz .LBB11_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:  .LBB11_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v5, v5, v3
 ; GCN-IR-NEXT:    v_or_b32_e32 v4, v4, v2
@@ -1729,9 +1729,9 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v8, v2, v3
+; GCN-IR-NEXT:    v_min_u32_e32 v10, v2, v3
 ; GCN-IR-NEXT:    s_movk_i32 s6, 0xffd0
-; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, s6, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, s6, v10
 ; GCN-IR-NEXT:    v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[2:3]
@@ -1746,54 +1746,54 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB12_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
-; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v2
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
+; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
+; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
 ; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[8:9], v2
+; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[4:5], v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-IR-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB12_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_lshr_b64 v[10:11], s[8:9], v6
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v8
-; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[8:9], 0, 0, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v6
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:  .LBB12_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v4
-; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v12, v10
-; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v13, v11, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v4
+; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v12, v8
+; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v13, v9, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v4
 ; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 1, v6
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
+; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v10
+; GCN-IR-NEXT:    v_and_b32_e32 v11, v10, v1
+; GCN-IR-NEXT:    v_and_b32_e32 v10, v10, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v14, 31, v4
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v14
-; GCN-IR-NEXT:    v_and_b32_e32 v15, v14, v1
-; GCN-IR-NEXT:    v_and_b32_e32 v14, v14, v0
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v9, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v8, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, v5
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT:    v_sub_i32_e32 v10, vcc, v10, v14
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, v4
-; GCN-IR-NEXT:    v_subb_u32_e32 v11, vcc, v11, v15, vcc
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v10
+; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v5
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, v4
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz .LBB12_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:  .LBB12_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v5, v5, v3
 ; GCN-IR-NEXT:    v_or_b32_e32 v4, v4, v2
@@ -1836,8 +1836,8 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GCN-IR-NEXT:    v_add_i32_e64 v2, s[4:5], 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v8, v2, v3
-; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 48, v8
+; GCN-IR-NEXT:    v_min_u32_e32 v10, v2, v3
+; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 48, v10
 ; GCN-IR-NEXT:    v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5]
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[4:5], 63, v[2:3]
@@ -1859,44 +1859,44 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[0:1], v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB13_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    v_lshr_b64 v[10:11], v[0:1], v6
-; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 0xffffffcf, v8
-; GCN-IR-NEXT:    v_addc_u32_e64 v7, s[8:9], 0, -1, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT:    s_movk_i32 s10, 0x7fff
+; GCN-IR-NEXT:    v_lshr_b64 v[8:9], v[0:1], v6
+; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 0xffffffcf, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_addc_u32_e64 v7, s[4:5], 0, -1, vcc
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT:    s_movk_i32 s12, 0x7fff
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:  .LBB13_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v4
-; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, s10, v10
-; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, 0, v11, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v4
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, s12, v8
+; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, 0, v9, vcc
 ; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 1, v6
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v4
 ; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v14, 31, v4
+; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v10
+; GCN-IR-NEXT:    v_and_b32_e32 v10, 0x8000, v10
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v14
-; GCN-IR-NEXT:    v_and_b32_e32 v14, 0x8000, v14
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v9, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v8, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, v5
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT:    v_sub_i32_e32 v10, vcc, v10, v14
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, v4
-; GCN-IR-NEXT:    v_subbrev_u32_e32 v11, vcc, 0, v11, vcc
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
+; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v5
+; GCN-IR-NEXT:    v_subbrev_u32_e64 v9, s[4:5], 0, v9, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, v4
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz .LBB13_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:  .LBB13_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v5, v5, v3
 ; GCN-IR-NEXT:    v_or_b32_e32 v4, v4, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index f1d7c84836ca7..e9017939f8a4a 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -341,47 +341,47 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[0:1], v4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB1_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v2
-; GCN-IR-NEXT:    v_lshr_b64 v[10:11], v[0:1], v10
+; GCN-IR-NEXT:    v_lshr_b64 v[8:9], v[0:1], v10
 ; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v3, vcc
 ; GCN-IR-NEXT:    v_not_b32_e32 v0, v14
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v15
-; GCN-IR-NEXT:    v_addc_u32_e64 v1, s[8:9], -1, 0, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_addc_u32_e64 v1, s[4:5], -1, 0, vcc
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:  .LBB1_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v6
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v12, v10
-; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v13, v11, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v6
+; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v12, v8
+; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v13, v9, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v10, v4
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v6
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v5, v11, v5
+; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v10
+; GCN-IR-NEXT:    v_and_b32_e32 v11, v10, v3
+; GCN-IR-NEXT:    v_and_b32_e32 v10, v10, v2
 ; GCN-IR-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v14, 31, v6
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v14
-; GCN-IR-NEXT:    v_and_b32_e32 v15, v14, v3
-; GCN-IR-NEXT:    v_and_b32_e32 v14, v14, v2
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v9, v5
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v8, v4
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, v7
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT:    v_sub_i32_e32 v10, vcc, v10, v14
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, v6
-; GCN-IR-NEXT:    v_subb_u32_e32 v11, vcc, v11, v15, vcc
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v10
+; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v7
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, v6
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz .LBB1_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:  .LBB1_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[4:5], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v4, v7, v1
 ; GCN-IR-NEXT:    v_or_b32_e32 v5, v6, v0
@@ -1065,8 +1065,8 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v14, v2, v3
-; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 0xffffffd0, v14
+; GCN-IR-NEXT:    v_min_u32_e32 v10, v2, v3
+; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 0xffffffd0, v10
 ; GCN-IR-NEXT:    v_addc_u32_e64 v7, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[6:7]
@@ -1081,54 +1081,54 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB9_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v6
-; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v7, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v6
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v7, vcc
+; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
 ; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[8:9], v2
+; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[4:5], v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-IR-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB9_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v14
-; GCN-IR-NEXT:    v_lshr_b64 v[10:11], s[8:9], v8
-; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[8:9], 0, 0, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v8
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:  .LBB9_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v4
-; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v12, v10
-; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v13, v11, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v4
+; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v12, v8
+; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v13, v9, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v4
 ; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 1, v6
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
+; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v10
+; GCN-IR-NEXT:    v_and_b32_e32 v11, v10, v1
+; GCN-IR-NEXT:    v_and_b32_e32 v10, v10, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v14, 31, v4
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v14
-; GCN-IR-NEXT:    v_and_b32_e32 v15, v14, v1
-; GCN-IR-NEXT:    v_and_b32_e32 v14, v14, v0
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v9, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v8, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, v5
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT:    v_sub_i32_e32 v10, vcc, v10, v14
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, v4
-; GCN-IR-NEXT:    v_subb_u32_e32 v11, vcc, v11, v15, vcc
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v10
+; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v5
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, v4
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz .LBB9_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:  .LBB9_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[2:3], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v2, v5, v1
 ; GCN-IR-NEXT:    v_or_b32_e32 v3, v4, v0
@@ -1177,44 +1177,44 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[0:1], v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB10_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    v_lshr_b64 v[8:9], v[0:1], v8
+; GCN-IR-NEXT:    v_lshr_b64 v[6:7], v[0:1], v8
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffcf, v10
-; GCN-IR-NEXT:    v_addc_u32_e64 v1, s[8:9], 0, -1, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT:    s_movk_i32 s10, 0x7fff
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, -1, vcc
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    s_movk_i32 s12, 0x7fff
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:  .LBB10_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, s10, v8
-; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, 0, v9, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v4
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, s12, v6
+; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, 0, v7, vcc
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v8, v2
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v4
 ; GCN-IR-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v4
+; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v8, 0x8000, v8
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v10
-; GCN-IR-NEXT:    v_and_b32_e32 v10, 0x8000, v10
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v7, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v6, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, v5
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, v8, v10
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, v4
-; GCN-IR-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v9, v3
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[4:5], v6, v8
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT:    v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v4
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz .LBB10_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:  .LBB10_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[2:3], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v2, v5, v1
 ; GCN-IR-NEXT:    v_or_b32_e32 v3, v4, v0
@@ -1377,43 +1377,43 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[0:1], v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB12_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    v_lshr_b64 v[8:9], v[0:1], v8
+; GCN-IR-NEXT:    v_lshr_b64 v[6:7], v[0:1], v8
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc4, v10
-; GCN-IR-NEXT:    v_addc_u32_e64 v1, s[8:9], 0, -1, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, -1, vcc
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:  .LBB12_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v4
-; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 23, v8
-; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, 0, v9, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v4
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 23, v6
+; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, 0, v7, vcc
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v8, v2
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v4
 ; GCN-IR-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v4
+; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v8, 24, v8
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v10
-; GCN-IR-NEXT:    v_and_b32_e32 v10, 24, v10
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v7, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v6, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, v5
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, v8, v10
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, v4
-; GCN-IR-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v9, v3
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[4:5], v6, v8
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v5
+; GCN-IR-NEXT:    v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v4
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz .LBB12_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:  .LBB12_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[2:3], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v2, v5, v1
 ; GCN-IR-NEXT:    v_or_b32_e32 v3, v4, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index bd742968ba37b..6480a88d40f5a 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -322,12 +322,12 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v2
 ; GCN-IR-NEXT:    v_add_i32_e64 v4, s[6:7], 32, v4
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v3
-; GCN-IR-NEXT:    v_min_u32_e32 v10, v4, v5
+; GCN-IR-NEXT:    v_min_u32_e32 v12, v4, v5
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v0
 ; GCN-IR-NEXT:    v_add_i32_e64 v4, s[6:7], 32, v4
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v11, v4, v5
-; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[6:7], v10, v11
+; GCN-IR-NEXT:    v_min_u32_e32 v13, v4, v5
+; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[6:7], v12, v13
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_subb_u32_e64 v5, s[6:7], 0, 0, s[6:7]
@@ -350,47 +350,47 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[0:1], v4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB1_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, -1, v2
 ; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, -1, v3, vcc
-; GCN-IR-NEXT:    v_not_b32_e32 v6, v10
-; GCN-IR-NEXT:    v_lshr_b64 v[12:13], v[0:1], v8
-; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, v6, v11
-; GCN-IR-NEXT:    v_addc_u32_e64 v9, s[8:9], -1, 0, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT:    v_not_b32_e32 v6, v12
+; GCN-IR-NEXT:    v_lshr_b64 v[10:11], v[0:1], v8
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, v6, v13
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-IR-NEXT:    v_addc_u32_e64 v9, s[4:5], -1, 0, vcc
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:  .LBB1_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[12:13], v[12:13], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT:    v_or_b32_e32 v12, v12, v6
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v14, v12
-; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v15, v13, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v6
+; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v14, v10
+; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v15, v11, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v12, v4
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v6
 ; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v8
+; GCN-IR-NEXT:    v_or_b32_e32 v5, v13, v5
+; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v12
+; GCN-IR-NEXT:    v_and_b32_e32 v13, v12, v3
+; GCN-IR-NEXT:    v_and_b32_e32 v12, v12, v2
 ; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v16, 31, v6
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v16
-; GCN-IR-NEXT:    v_and_b32_e32 v17, v16, v3
-; GCN-IR-NEXT:    v_and_b32_e32 v16, v16, v2
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v11, v5
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v10, v4
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, v7
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT:    v_sub_i32_e32 v12, vcc, v12, v16
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, v6
-; GCN-IR-NEXT:    v_subb_u32_e32 v13, vcc, v13, v17, vcc
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v12
+; GCN-IR-NEXT:    v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v13, v7
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, v6
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz .LBB1_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:  .LBB1_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v7, v7, v5
 ; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v4
@@ -1166,8 +1166,8 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v8, v2, v3
-; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffd0, v8
+; GCN-IR-NEXT:    v_min_u32_e32 v10, v2, v3
+; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffd0, v10
 ; GCN-IR-NEXT:    v_addc_u32_e64 v3, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[2:3]
@@ -1182,54 +1182,54 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB8_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
-; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v2
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0x8000
+; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
+; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
 ; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[8:9], v2
+; GCN-IR-NEXT:    v_lshl_b64 v[2:3], s[4:5], v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-IR-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB8_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_lshr_b64 v[10:11], s[8:9], v6
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v8
-; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[8:9], 0, 0, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v6
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, 0, vcc
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:  .LBB8_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v4
-; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v12, v10
-; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v13, v11, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v4
+; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v12, v8
+; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v13, v9, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v4
 ; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 1, v6
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
+; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v10
+; GCN-IR-NEXT:    v_and_b32_e32 v11, v10, v1
+; GCN-IR-NEXT:    v_and_b32_e32 v10, v10, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v14, 31, v4
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v14
-; GCN-IR-NEXT:    v_and_b32_e32 v15, v14, v1
-; GCN-IR-NEXT:    v_and_b32_e32 v14, v14, v0
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v9, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v8, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, v5
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT:    v_sub_i32_e32 v10, vcc, v10, v14
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, v4
-; GCN-IR-NEXT:    v_subb_u32_e32 v11, vcc, v11, v15, vcc
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v10
+; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v5
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, v4
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz .LBB8_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:  .LBB8_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v5, v5, v3
 ; GCN-IR-NEXT:    v_or_b32_e32 v4, v4, v2
@@ -1262,8 +1262,8 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GCN-IR-NEXT:    v_add_i32_e64 v2, s[4:5], 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v8, v2, v3
-; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 48, v8
+; GCN-IR-NEXT:    v_min_u32_e32 v10, v2, v3
+; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 48, v10
 ; GCN-IR-NEXT:    v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5]
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[4:5], 63, v[2:3]
@@ -1284,44 +1284,44 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[0:1], v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB9_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    v_lshr_b64 v[10:11], v[0:1], v6
-; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 0xffffffcf, v8
-; GCN-IR-NEXT:    v_addc_u32_e64 v7, s[8:9], 0, -1, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT:    s_movk_i32 s10, 0x7fff
+; GCN-IR-NEXT:    v_lshr_b64 v[8:9], v[0:1], v6
+; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 0xffffffcf, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_addc_u32_e64 v7, s[4:5], 0, -1, vcc
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT:    s_movk_i32 s12, 0x7fff
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT:  .LBB9_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v4
-; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, s10, v10
-; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, 0, v11, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v4
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, s12, v8
+; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, 0, v9, vcc
 ; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 1, v6
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v4
 ; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v4
+; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v10
+; GCN-IR-NEXT:    v_and_b32_e32 v10, 0x8000, v10
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v12
-; GCN-IR-NEXT:    v_and_b32_e32 v12, 0x8000, v12
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v9, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v8, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, v5
-; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT:    v_sub_i32_e32 v10, vcc, v10, v12
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, v4
-; GCN-IR-NEXT:    v_subbrev_u32_e32 v11, vcc, 0, v11, vcc
-; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
+; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v5
+; GCN-IR-NEXT:    v_subbrev_u32_e64 v9, s[4:5], 0, v9, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, v4
+; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz .LBB9_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:  .LBB9_5: ; %Flow4
-; GCN-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v5, v5, v3
 ; GCN-IR-NEXT:    v_or_b32_e32 v4, v4, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index 310f3ad04917b..a401f989a2507 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -508,12 +508,12 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX942:       ; %bb.0: ; %entry
 ; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX942-NEXT:    v_and_b32_e32 v16, 0x3ff, v0
-; GFX942-NEXT:    v_lshlrev_b32_e32 v1, 3, v16
+; GFX942-NEXT:    v_lshlrev_b32_e32 v3, 3, v16
 ; GFX942-NEXT:    v_cmp_lt_u32_e64 s[0:1], 14, v16
 ; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v16
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    global_load_dwordx2 v[2:3], v1, s[8:9]
-; GFX942-NEXT:    ; implicit-def: $vgpr0
+; GFX942-NEXT:    global_load_dwordx2 v[0:1], v3, s[8:9]
+; GFX942-NEXT:    ; implicit-def: $vgpr2
 ; GFX942-NEXT:    ; implicit-def: $vgpr12
 ; GFX942-NEXT:    ; implicit-def: $vgpr10
 ; GFX942-NEXT:    ; implicit-def: $vgpr13
@@ -521,75 +521,74 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX942-NEXT:    ; implicit-def: $vgpr11
 ; GFX942-NEXT:    ; implicit-def: $vgpr15
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_lshrrev_b32_e32 v4, 24, v3
-; GFX942-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX942-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
-; GFX942-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
-; GFX942-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX942-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX942-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX942-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX942-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX942-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
+; GFX942-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX942-NEXT:    v_lshrrev_b32_e32 v9, 8, v0
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX942-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX942-NEXT:  ; %bb.1: ; %bb.1
-; GFX942-NEXT:    global_load_dwordx2 v[0:1], v1, s[10:11]
+; GFX942-NEXT:    global_load_dwordx2 v[2:3], v3, s[10:11]
 ; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v16
 ; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX942-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; GFX942-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX942-NEXT:    v_mov_b32_e32 v5, 7
 ; GFX942-NEXT:    v_mov_b32_e32 v6, 6
-; GFX942-NEXT:    v_mov_b32_e32 v3, 5
+; GFX942-NEXT:    v_mov_b32_e32 v1, 5
 ; GFX942-NEXT:    v_mov_b32_e32 v7, 4
 ; GFX942-NEXT:    v_mov_b32_e32 v8, 3
 ; GFX942-NEXT:    v_mov_b32_e32 v9, 2
-; GFX942-NEXT:    v_mov_b32_e32 v2, 1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 1
 ; GFX942-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_lshrrev_b32_e32 v15, 24, v1
-; GFX942-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
-; GFX942-NEXT:    v_lshrrev_b32_e32 v14, 8, v1
-; GFX942-NEXT:    v_lshrrev_b32_e32 v13, 24, v0
-; GFX942-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX942-NEXT:    v_lshrrev_b32_e32 v12, 8, v0
+; GFX942-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GFX942-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; GFX942-NEXT:    v_lshrrev_b32_e32 v14, 8, v3
+; GFX942-NEXT:    v_lshrrev_b32_e32 v13, 24, v2
+; GFX942-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX942-NEXT:    v_lshrrev_b32_e32 v12, 8, v2
 ; GFX942-NEXT:  .LBB10_2: ; %Flow
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
 ; GFX942-NEXT:    s_cbranch_execz .LBB10_4
 ; GFX942-NEXT:  ; %bb.3: ; %bb.2
-; GFX942-NEXT:    v_lshlrev_b16_e32 v0, 8, v9
-; GFX942-NEXT:    v_lshlrev_b16_e32 v1, 8, v7
-; GFX942-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT:    v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-NEXT:    v_lshlrev_b16_e32 v2, 8, v9
+; GFX942-NEXT:    v_lshlrev_b16_e32 v3, 8, v7
+; GFX942-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-NEXT:    v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX942-NEXT:    v_lshlrev_b16_e32 v11, 8, v4
-; GFX942-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    v_lshlrev_b16_e32 v1, 8, v6
-; GFX942-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT:    v_lshlrev_b16_e32 v3, 8, v6
+; GFX942-NEXT:    v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX942-NEXT:    v_or_b32_sdwa v11, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX942-NEXT:    v_mov_b32_e32 v10, 0
-; GFX942-NEXT:    v_or_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    global_store_dwordx2 v10, v[0:1], s[12:13]
-; GFX942-NEXT:    v_mov_b32_e32 v0, v2
+; GFX942-NEXT:    v_or_b32_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT:    global_store_dwordx2 v10, v[2:3], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v12, v9
 ; GFX942-NEXT:    v_mov_b32_e32 v10, v8
 ; GFX942-NEXT:    v_mov_b32_e32 v13, v7
-; GFX942-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX942-NEXT:    v_mov_b32_e32 v14, v6
 ; GFX942-NEXT:    v_mov_b32_e32 v11, v5
 ; GFX942-NEXT:    v_mov_b32_e32 v15, v4
 ; GFX942-NEXT:  .LBB10_4: ; %bb.3
 ; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT:    v_lshlrev_b16_e32 v3, 8, v12
-; GFX942-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT:    v_lshlrev_b16_e32 v3, 8, v13
-; GFX942-NEXT:    v_or_b32_sdwa v3, v10, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT:    v_mov_b32_e32 v2, 0
-; GFX942-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    v_lshlrev_b16_e32 v3, 8, v14
-; GFX942-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT:    v_lshlrev_b16_e32 v3, 8, v15
-; GFX942-NEXT:    v_or_b32_sdwa v3, v11, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[14:15]
+; GFX942-NEXT:    v_lshlrev_b16_e32 v0, 8, v12
+; GFX942-NEXT:    v_lshlrev_b16_e32 v1, 8, v13
+; GFX942-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-NEXT:    v_or_b32_sdwa v1, v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-NEXT:    v_lshlrev_b16_e32 v2, 8, v15
+; GFX942-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT:    v_lshlrev_b16_e32 v1, 8, v14
+; GFX942-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-NEXT:    v_or_b32_sdwa v2, v11, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[14:15]
 ; GFX942-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index b7c12854d1115..4212fd3b35cd8 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -1292,23 +1292,23 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_clause 0x1
 ; GFX1064-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
-; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
+; GFX1064-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v0
 ; GFX1064-NEXT:    s_mov_b64 vcc, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    global_load_dwordx3 v[1:3], v1, s[10:11]
-; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
 ; GFX1064-NEXT:    s_cbranch_execz .LBB22_2
 ; GFX1064-NEXT:  ; %bb.1: ; %bb
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
+; GFX1064-NEXT:    global_load_dword v0, v0, s[6:7] glc dlc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_b64 vcc, vcc, exec
 ; GFX1064-NEXT:  .LBB22_2: ; %exit
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    v_div_fmas_f32 v1, v1, v2, v3

diff  --git a/llvm/test/CodeGen/ARM/and-cmp0-sink.ll b/llvm/test/CodeGen/ARM/and-cmp0-sink.ll
index 389193f9d2407..fb9139c0d1285 100644
--- a/llvm/test/CodeGen/ARM/and-cmp0-sink.ll
+++ b/llvm/test/CodeGen/ARM/and-cmp0-sink.ll
@@ -112,20 +112,20 @@ define void @f(i32 %v, ptr noalias  %outp) {
 ; V6M-NEXT:    push {r4, lr}
 ; V6M-NEXT:    movs r2, #0
 ; V6M-NEXT:    str r2, [r1]
-; V6M-NEXT:    movs r2, #14
-; V6M-NEXT:    ands r2, r0
+; V6M-NEXT:    movs r3, #14
+; V6M-NEXT:    ands r3, r0
 ; V6M-NEXT:    movs r4, #4
 ; V6M-NEXT:    ands r4, r0
-; V6M-NEXT:    movs r3, #2
-; V6M-NEXT:    ands r3, r0
+; V6M-NEXT:    movs r2, #2
+; V6M-NEXT:    ands r2, r0
 ; V6M-NEXT:    lsls r0, r0, #31
 ; V6M-NEXT:    bne .LBB0_5
 ; V6M-NEXT:  @ %bb.1: @ %if.then
 ; V6M-NEXT:    movs r0, #129
-; V6M-NEXT:    cmp r3, #0
+; V6M-NEXT:    cmp r2, #0
 ; V6M-NEXT:    beq .LBB0_3
 ; V6M-NEXT:  @ %bb.2:
-; V6M-NEXT:    lsls r3, r0, #8
+; V6M-NEXT:    lsls r2, r0, #8
 ; V6M-NEXT:  .LBB0_3: @ %if.then
 ; V6M-NEXT:    cmp r4, #0
 ; V6M-NEXT:    beq .LBB0_10
@@ -134,22 +134,22 @@ define void @f(i32 %v, ptr noalias  %outp) {
 ; V6M-NEXT:    b .LBB0_9
 ; V6M-NEXT:  .LBB0_5: @ %if.else
 ; V6M-NEXT:    movs r0, #129
-; V6M-NEXT:    cmp r3, #0
+; V6M-NEXT:    cmp r2, #0
 ; V6M-NEXT:    beq .LBB0_7
 ; V6M-NEXT:  @ %bb.6:
-; V6M-NEXT:    lsls r3, r0, #6
+; V6M-NEXT:    lsls r2, r0, #6
 ; V6M-NEXT:  .LBB0_7: @ %if.else
 ; V6M-NEXT:    cmp r4, #0
 ; V6M-NEXT:    beq .LBB0_10
 ; V6M-NEXT:  @ %bb.8: @ %if.else
 ; V6M-NEXT:    lsls r0, r0, #5
 ; V6M-NEXT:  .LBB0_9: @ %if.else
-; V6M-NEXT:    adds r3, r3, r0
+; V6M-NEXT:    adds r2, r2, r0
 ; V6M-NEXT:  .LBB0_10: @ %if.else
-; V6M-NEXT:    cmp r2, #0
+; V6M-NEXT:    cmp r3, #0
 ; V6M-NEXT:    beq .LBB0_12
 ; V6M-NEXT:  @ %bb.11: @ %if.end
-; V6M-NEXT:    str r3, [r1]
+; V6M-NEXT:    str r2, [r1]
 ; V6M-NEXT:  .LBB0_12: @ %exit
 ; V6M-NEXT:    pop {r4, pc}
 entry:

diff  --git a/llvm/test/CodeGen/ARM/cttz.ll b/llvm/test/CodeGen/ARM/cttz.ll
index e1bf4837d6a47..1146ad64ee709 100644
--- a/llvm/test/CodeGen/ARM/cttz.ll
+++ b/llvm/test/CodeGen/ARM/cttz.ll
@@ -229,24 +229,24 @@ define i64 @test_i64(i64 %a) {
 ; CHECK-6M-NEXT:    orrs r0, r3
 ; CHECK-6M-NEXT:    beq .LBB3_6
 ; CHECK-6M-NEXT:  @ %bb.1: @ %cond.false
-; CHECK-6M-NEXT:    ldr r5, .LCPI3_0
+; CHECK-6M-NEXT:    ldr r6, .LCPI3_0
 ; CHECK-6M-NEXT:    adr r4, .LCPI3_1
 ; CHECK-6M-NEXT:    movs r0, #32
 ; CHECK-6M-NEXT:    cmp r3, #0
-; CHECK-6M-NEXT:    mov r6, r0
+; CHECK-6M-NEXT:    mov r5, r0
 ; CHECK-6M-NEXT:    beq .LBB3_3
 ; CHECK-6M-NEXT:  @ %bb.2: @ %cond.false
-; CHECK-6M-NEXT:    rsbs r6, r3, #0
-; CHECK-6M-NEXT:    ands r6, r3
-; CHECK-6M-NEXT:    muls r6, r5, r6
-; CHECK-6M-NEXT:    lsrs r3, r6, #27
-; CHECK-6M-NEXT:    ldrb r6, [r4, r3]
+; CHECK-6M-NEXT:    rsbs r5, r3, #0
+; CHECK-6M-NEXT:    ands r5, r3
+; CHECK-6M-NEXT:    muls r5, r6, r5
+; CHECK-6M-NEXT:    lsrs r3, r5, #27
+; CHECK-6M-NEXT:    ldrb r5, [r4, r3]
 ; CHECK-6M-NEXT:  .LBB3_3: @ %cond.false
-; CHECK-6M-NEXT:    adds r6, #32
+; CHECK-6M-NEXT:    adds r5, #32
 ; CHECK-6M-NEXT:    rsbs r3, r2, #0
 ; CHECK-6M-NEXT:    ands r3, r2
-; CHECK-6M-NEXT:    muls r5, r3, r5
-; CHECK-6M-NEXT:    lsrs r3, r5, #27
+; CHECK-6M-NEXT:    muls r6, r3, r6
+; CHECK-6M-NEXT:    lsrs r3, r6, #27
 ; CHECK-6M-NEXT:    cmp r2, #0
 ; CHECK-6M-NEXT:    bne .LBB3_7
 ; CHECK-6M-NEXT:  @ %bb.4: @ %cond.false
@@ -260,7 +260,7 @@ define i64 @test_i64(i64 %a) {
 ; CHECK-6M-NEXT:    ldrb r0, [r4, r3]
 ; CHECK-6M-NEXT:    bne .LBB3_5
 ; CHECK-6M-NEXT:  .LBB3_8: @ %cond.false
-; CHECK-6M-NEXT:    mov r0, r6
+; CHECK-6M-NEXT:    mov r0, r5
 ; CHECK-6M-NEXT:    pop {r4, r5, r6, pc}
 ; CHECK-6M-NEXT:    .p2align 2
 ; CHECK-6M-NEXT:  @ %bb.9:
@@ -279,24 +279,24 @@ define i64 @test_i64(i64 %a) {
 ; CHECK-8MBASE-NEXT:    orrs r0, r3
 ; CHECK-8MBASE-NEXT:    beq .LBB3_6
 ; CHECK-8MBASE-NEXT:  @ %bb.1: @ %cond.false
-; CHECK-8MBASE-NEXT:    movw r5, #46385
-; CHECK-8MBASE-NEXT:    movt r5, #1916
+; CHECK-8MBASE-NEXT:    movw r6, #46385
+; CHECK-8MBASE-NEXT:    movt r6, #1916
 ; CHECK-8MBASE-NEXT:    adr r4, .LCPI3_0
 ; CHECK-8MBASE-NEXT:    movs r0, #32
-; CHECK-8MBASE-NEXT:    mov r6, r0
+; CHECK-8MBASE-NEXT:    mov r5, r0
 ; CHECK-8MBASE-NEXT:    cbz r3, .LBB3_3
 ; CHECK-8MBASE-NEXT:  @ %bb.2: @ %cond.false
-; CHECK-8MBASE-NEXT:    rsbs r6, r3, #0
-; CHECK-8MBASE-NEXT:    ands r6, r3
-; CHECK-8MBASE-NEXT:    muls r6, r5, r6
-; CHECK-8MBASE-NEXT:    lsrs r3, r6, #27
-; CHECK-8MBASE-NEXT:    ldrb r6, [r4, r3]
+; CHECK-8MBASE-NEXT:    rsbs r5, r3, #0
+; CHECK-8MBASE-NEXT:    ands r5, r3
+; CHECK-8MBASE-NEXT:    muls r5, r6, r5
+; CHECK-8MBASE-NEXT:    lsrs r3, r5, #27
+; CHECK-8MBASE-NEXT:    ldrb r5, [r4, r3]
 ; CHECK-8MBASE-NEXT:  .LBB3_3: @ %cond.false
-; CHECK-8MBASE-NEXT:    adds r6, #32
+; CHECK-8MBASE-NEXT:    adds r5, #32
 ; CHECK-8MBASE-NEXT:    rsbs r3, r2, #0
 ; CHECK-8MBASE-NEXT:    ands r3, r2
-; CHECK-8MBASE-NEXT:    muls r5, r3, r5
-; CHECK-8MBASE-NEXT:    lsrs r3, r5, #27
+; CHECK-8MBASE-NEXT:    muls r6, r3, r6
+; CHECK-8MBASE-NEXT:    lsrs r3, r6, #27
 ; CHECK-8MBASE-NEXT:    cmp r2, #0
 ; CHECK-8MBASE-NEXT:    bne .LBB3_7
 ; CHECK-8MBASE-NEXT:  @ %bb.4: @ %cond.false
@@ -310,7 +310,7 @@ define i64 @test_i64(i64 %a) {
 ; CHECK-8MBASE-NEXT:    ldrb r0, [r4, r3]
 ; CHECK-8MBASE-NEXT:    bne .LBB3_5
 ; CHECK-8MBASE-NEXT:  .LBB3_8: @ %cond.false
-; CHECK-8MBASE-NEXT:    mov r0, r6
+; CHECK-8MBASE-NEXT:    mov r0, r5
 ; CHECK-8MBASE-NEXT:    pop {r4, r5, r6, pc}
 ; CHECK-8MBASE-NEXT:    .p2align 2
 ; CHECK-8MBASE-NEXT:  @ %bb.9:
@@ -503,24 +503,24 @@ define i64 @test_i64_zero_undef(i64 %a) {
 ; CHECK-6M-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-6M-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-6M-NEXT:    mov r2, r0
-; CHECK-6M-NEXT:    ldr r4, .LCPI7_0
+; CHECK-6M-NEXT:    ldr r5, .LCPI7_0
 ; CHECK-6M-NEXT:    adr r3, .LCPI7_1
 ; CHECK-6M-NEXT:    movs r0, #32
 ; CHECK-6M-NEXT:    cmp r1, #0
-; CHECK-6M-NEXT:    mov r5, r0
+; CHECK-6M-NEXT:    mov r4, r0
 ; CHECK-6M-NEXT:    beq .LBB7_2
 ; CHECK-6M-NEXT:  @ %bb.1:
-; CHECK-6M-NEXT:    rsbs r5, r1, #0
-; CHECK-6M-NEXT:    ands r5, r1
-; CHECK-6M-NEXT:    muls r5, r4, r5
-; CHECK-6M-NEXT:    lsrs r1, r5, #27
-; CHECK-6M-NEXT:    ldrb r5, [r3, r1]
+; CHECK-6M-NEXT:    rsbs r4, r1, #0
+; CHECK-6M-NEXT:    ands r4, r1
+; CHECK-6M-NEXT:    muls r4, r5, r4
+; CHECK-6M-NEXT:    lsrs r1, r4, #27
+; CHECK-6M-NEXT:    ldrb r4, [r3, r1]
 ; CHECK-6M-NEXT:  .LBB7_2:
-; CHECK-6M-NEXT:    adds r5, #32
+; CHECK-6M-NEXT:    adds r4, #32
 ; CHECK-6M-NEXT:    rsbs r1, r2, #0
 ; CHECK-6M-NEXT:    ands r1, r2
-; CHECK-6M-NEXT:    muls r4, r1, r4
-; CHECK-6M-NEXT:    lsrs r1, r4, #27
+; CHECK-6M-NEXT:    muls r5, r1, r5
+; CHECK-6M-NEXT:    lsrs r1, r5, #27
 ; CHECK-6M-NEXT:    cmp r2, #0
 ; CHECK-6M-NEXT:    bne .LBB7_5
 ; CHECK-6M-NEXT:  @ %bb.3:
@@ -532,7 +532,7 @@ define i64 @test_i64_zero_undef(i64 %a) {
 ; CHECK-6M-NEXT:    ldrb r0, [r3, r1]
 ; CHECK-6M-NEXT:    bne .LBB7_4
 ; CHECK-6M-NEXT:  .LBB7_6:
-; CHECK-6M-NEXT:    mov r0, r5
+; CHECK-6M-NEXT:    mov r0, r4
 ; CHECK-6M-NEXT:    movs r1, #0
 ; CHECK-6M-NEXT:    pop {r4, r5, r7, pc}
 ; CHECK-6M-NEXT:    .p2align 2
@@ -547,24 +547,24 @@ define i64 @test_i64_zero_undef(i64 %a) {
 ; CHECK-8MBASE-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-8MBASE-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-8MBASE-NEXT:    mov r2, r0
-; CHECK-8MBASE-NEXT:    movw r4, #46385
-; CHECK-8MBASE-NEXT:    movt r4, #1916
+; CHECK-8MBASE-NEXT:    movw r5, #46385
+; CHECK-8MBASE-NEXT:    movt r5, #1916
 ; CHECK-8MBASE-NEXT:    adr r3, .LCPI7_0
 ; CHECK-8MBASE-NEXT:    movs r0, #32
-; CHECK-8MBASE-NEXT:    mov r5, r0
+; CHECK-8MBASE-NEXT:    mov r4, r0
 ; CHECK-8MBASE-NEXT:    cbz r1, .LBB7_2
 ; CHECK-8MBASE-NEXT:  @ %bb.1:
-; CHECK-8MBASE-NEXT:    rsbs r5, r1, #0
-; CHECK-8MBASE-NEXT:    ands r5, r1
-; CHECK-8MBASE-NEXT:    muls r5, r4, r5
-; CHECK-8MBASE-NEXT:    lsrs r1, r5, #27
-; CHECK-8MBASE-NEXT:    ldrb r5, [r3, r1]
+; CHECK-8MBASE-NEXT:    rsbs r4, r1, #0
+; CHECK-8MBASE-NEXT:    ands r4, r1
+; CHECK-8MBASE-NEXT:    muls r4, r5, r4
+; CHECK-8MBASE-NEXT:    lsrs r1, r4, #27
+; CHECK-8MBASE-NEXT:    ldrb r4, [r3, r1]
 ; CHECK-8MBASE-NEXT:  .LBB7_2:
-; CHECK-8MBASE-NEXT:    adds r5, #32
+; CHECK-8MBASE-NEXT:    adds r4, #32
 ; CHECK-8MBASE-NEXT:    rsbs r1, r2, #0
 ; CHECK-8MBASE-NEXT:    ands r1, r2
-; CHECK-8MBASE-NEXT:    muls r4, r1, r4
-; CHECK-8MBASE-NEXT:    lsrs r1, r4, #27
+; CHECK-8MBASE-NEXT:    muls r5, r1, r5
+; CHECK-8MBASE-NEXT:    lsrs r1, r5, #27
 ; CHECK-8MBASE-NEXT:    cmp r2, #0
 ; CHECK-8MBASE-NEXT:    bne .LBB7_5
 ; CHECK-8MBASE-NEXT:  @ %bb.3:
@@ -576,7 +576,7 @@ define i64 @test_i64_zero_undef(i64 %a) {
 ; CHECK-8MBASE-NEXT:    ldrb r0, [r3, r1]
 ; CHECK-8MBASE-NEXT:    bne .LBB7_4
 ; CHECK-8MBASE-NEXT:  .LBB7_6:
-; CHECK-8MBASE-NEXT:    mov r0, r5
+; CHECK-8MBASE-NEXT:    mov r0, r4
 ; CHECK-8MBASE-NEXT:    movs r1, #0
 ; CHECK-8MBASE-NEXT:    pop {r4, r5, r7, pc}
 ; CHECK-8MBASE-NEXT:    .p2align 2

diff  --git a/llvm/test/CodeGen/ARM/select-imm.ll b/llvm/test/CodeGen/ARM/select-imm.ll
index 2bef1c83d7969..186276b50ceeb 100644
--- a/llvm/test/CodeGen/ARM/select-imm.ll
+++ b/llvm/test/CodeGen/ARM/select-imm.ll
@@ -455,13 +455,13 @@ define void @t9(ptr %a, i8 %b) {
 ; ARMT2-NEXT:    cmp r0, r0
 ; ARMT2-NEXT:    popne {r4, pc}
 ; ARMT2-NEXT:  .LBB8_1: @ %while.body.preheader
-; ARMT2-NEXT:    mov r1, r0
-; ARMT2-NEXT:    add r2, r4, #1
+; ARMT2-NEXT:    add r1, r4, #1
+; ARMT2-NEXT:    mov r2, r0
 ; ARMT2-NEXT:  .LBB8_2: @ %while.body
 ; ARMT2-NEXT:    @ =>This Inner Loop Header: Depth=1
-; ARMT2-NEXT:    add r1, r1, #1
 ; ARMT2-NEXT:    add r2, r2, #1
-; ARMT2-NEXT:    uxtb r3, r1
+; ARMT2-NEXT:    add r1, r1, #1
+; ARMT2-NEXT:    uxtb r3, r2
 ; ARMT2-NEXT:    cmp r3, r0
 ; ARMT2-NEXT:    blt .LBB8_2
 ; ARMT2-NEXT:  @ %bb.3: @ %while.end
@@ -503,13 +503,13 @@ define void @t9(ptr %a, i8 %b) {
 ; THUMB2-NEXT:    it ne
 ; THUMB2-NEXT:    popne {r4, pc}
 ; THUMB2-NEXT:  .LBB8_1: @ %while.body.preheader
-; THUMB2-NEXT:    mov r1, r0
-; THUMB2-NEXT:    adds r2, r4, #1
+; THUMB2-NEXT:    adds r1, r4, #1
+; THUMB2-NEXT:    mov r2, r0
 ; THUMB2-NEXT:  .LBB8_2: @ %while.body
 ; THUMB2-NEXT:    @ =>This Inner Loop Header: Depth=1
-; THUMB2-NEXT:    adds r1, #1
 ; THUMB2-NEXT:    adds r2, #1
-; THUMB2-NEXT:    uxtb r3, r1
+; THUMB2-NEXT:    adds r1, #1
+; THUMB2-NEXT:    uxtb r3, r2
 ; THUMB2-NEXT:    cmp r3, r0
 ; THUMB2-NEXT:    blt .LBB8_2
 ; THUMB2-NEXT:  @ %bb.3: @ %while.end

diff  --git a/llvm/test/CodeGen/ARM/struct-byval-loop.ll b/llvm/test/CodeGen/ARM/struct-byval-loop.ll
index a90381acf4214..7a38dec2434f7 100644
--- a/llvm/test/CodeGen/ARM/struct-byval-loop.ll
+++ b/llvm/test/CodeGen/ARM/struct-byval-loop.ll
@@ -13,13 +13,13 @@ define void @test_80() {
 ; CHECK-NEXT:    .pad #152
 ; CHECK-NEXT:    sub sp, sp, #152
 ; CHECK-NEXT:    add r0, sp, #72
-; CHECK-NEXT:    ldr r2, .LCPI0_0
+; CHECK-NEXT:    ldr r1, .LCPI0_0
 ; CHECK-NEXT:    add r0, r0, #12
-; CHECK-NEXT:    mov r1, sp
+; CHECK-NEXT:    mov r2, sp
 ; CHECK-NEXT:  .LBB0_1: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r3, [r0], #4
-; CHECK-NEXT:    subs r2, r2, #4
-; CHECK-NEXT:    str r3, [r1], #4
+; CHECK-NEXT:    subs r1, r1, #4
+; CHECK-NEXT:    str r3, [r2], #4
 ; CHECK-NEXT:    bne .LBB0_1
 ; CHECK-NEXT:  @ %bb.2:
 ; CHECK-NEXT:    add r3, sp, #72
@@ -52,14 +52,14 @@ define void @test_4000() {
 ; CHECK-NEXT:    sub sp, sp, #920
 ; CHECK-NEXT:    sub sp, sp, #3072
 ; CHECK-NEXT:    add lr, sp, #3072
-; CHECK-NEXT:    ldr r2, .LCPI1_0
+; CHECK-NEXT:    ldr r1, .LCPI1_0
 ; CHECK-NEXT:    add r0, lr, #920
-; CHECK-NEXT:    mov r1, sp
+; CHECK-NEXT:    mov r2, sp
 ; CHECK-NEXT:    add r0, r0, #12
 ; CHECK-NEXT:  .LBB1_1: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r3, [r0], #4
-; CHECK-NEXT:    subs r2, r2, #4
-; CHECK-NEXT:    str r3, [r1], #4
+; CHECK-NEXT:    subs r1, r1, #4
+; CHECK-NEXT:    str r3, [r2], #4
 ; CHECK-NEXT:    bne .LBB1_1
 ; CHECK-NEXT:  @ %bb.2:
 ; CHECK-NEXT:    ldr r1, [sp, #3992]

diff  --git a/llvm/test/CodeGen/ARM/swifterror.ll b/llvm/test/CodeGen/ARM/swifterror.ll
index 259c20c8c9af6..f002c54fc60c0 100644
--- a/llvm/test/CodeGen/ARM/swifterror.ll
+++ b/llvm/test/CodeGen/ARM/swifterror.ll
@@ -79,17 +79,17 @@ define float @caller(ptr %error_ref) {
 ;
 ; CHECK-O0-LABEL: caller:
 ; CHECK-O0:       @ %bb.0: @ %entry
-; CHECK-O0-NEXT:    push {r7, r8, lr}
-; CHECK-O0-NEXT:    add r7, sp, #4
-; CHECK-O0-NEXT:    sub sp, sp, #12
+; CHECK-O0-NEXT:    push	{r7, r8, lr}
+; CHECK-O0-NEXT:    add	r7, sp, #4
+; CHECK-O0-NEXT:    sub	sp, sp, #12
 ; CHECK-O0-NEXT:    @ implicit-def: $r1
-; CHECK-O0-NEXT:    str r0, [sp] @ 4-byte Spill
-; CHECK-O0-NEXT:    mov r8, #0
-; CHECK-O0-NEXT:    bl _foo
-; CHECK-O0-NEXT:    str r8, [sp, #4] @ 4-byte Spill
-; CHECK-O0-NEXT:    movw r0, #0
-; CHECK-O0-NEXT:    cmp r8, r0
-; CHECK-O0-NEXT:    bne LBB1_2
+; CHECK-O0-NEXT:    str	r0, [sp]                        @ 4-byte Spill
+; CHECK-O0-NEXT:    mov	r8, #0
+; CHECK-O0-NEXT:    bl	_foo
+; CHECK-O0-NEXT:    str	r8, [sp, #4]                    @ 4-byte Spill
+; CHECK-O0-NEXT:    movw	r0, #0
+; CHECK-O0-NEXT:    cmp	r8, r0
+; CHECK-O0-NEXT:    bne	LBB1_2
 ; CHECK-O0-NEXT:  @ %bb.1: @ %cont
 ; CHECK-O0-NEXT:    ldr r1, [sp] @ 4-byte Reload
 ; CHECK-O0-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
@@ -100,7 +100,7 @@ define float @caller(ptr %error_ref) {
 ; CHECK-O0-NEXT:    bl _free
 ; CHECK-O0-NEXT:    mov r0, #1065353216
 ; CHECK-O0-NEXT:    sub sp, r7, #4
-; CHECK-O0-NEXT:    pop {r7, r8, pc}
+; CHECK-O0-NEXT:    pop	{r7, r8, pc}
 ;
 ; CHECK-ANDROID-LABEL: caller:
 ; CHECK-ANDROID:       @ %bb.0: @ %entry
@@ -174,11 +174,11 @@ define float @caller2(ptr %error_ref) {
 ;
 ; CHECK-O0-LABEL: caller2:
 ; CHECK-O0:       @ %bb.0: @ %entry
-; CHECK-O0-NEXT:    push {r7, r8, lr}
-; CHECK-O0-NEXT:    add r7, sp, #4
-; CHECK-O0-NEXT:    sub sp, sp, #16
+; CHECK-O0-NEXT:    push	{r7, r8, lr}
+; CHECK-O0-NEXT:    add	r7, sp, #4
+; CHECK-O0-NEXT:    sub	sp, sp, #16
 ; CHECK-O0-NEXT:    @ implicit-def: $r1
-; CHECK-O0-NEXT:    str r0, [sp, #8] @ 4-byte Spill
+; CHECK-O0-NEXT:    str	r0, [sp, #8] @ 4-byte Spill
 ; CHECK-O0-NEXT:  LBB2_1: @ %bb_loop
 ; CHECK-O0-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-O0-NEXT:    mov r8, #0
@@ -206,7 +206,7 @@ define float @caller2(ptr %error_ref) {
 ; CHECK-O0-NEXT:    bl _free
 ; CHECK-O0-NEXT:    mov r0, #1065353216
 ; CHECK-O0-NEXT:    sub sp, r7, #4
-; CHECK-O0-NEXT:    pop {r7, r8, pc}
+; CHECK-O0-NEXT:    pop	{r7, r8, pc}
 ;
 ; CHECK-ANDROID-LABEL: caller2:
 ; CHECK-ANDROID:       @ %bb.0: @ %entry
@@ -400,35 +400,35 @@ define float @foo_loop(ptr swifterror %error_ptr_ref, i32 %cc, float %cc2) {
 ; CHECK-O0-NEXT:    mov r7, sp
 ; CHECK-O0-NEXT:    sub sp, sp, #20
 ; CHECK-O0-NEXT:    str r0, [sp, #8] @ 4-byte Spill
-; CHECK-O0-NEXT:    str r8, [r7, #-8] @ 4-byte Spill
 ; CHECK-O0-NEXT:    vmov s0, r1
-; CHECK-O0-NEXT:    vstr s0, [r7, #-4] @ 4-byte Spill
+; CHECK-O0-NEXT:    vstr s0, [r7, #-8] @ 4-byte Spill
+; CHECK-O0-NEXT:    str r8, [r7, #-4] @ 4-byte Spill
 ; CHECK-O0-NEXT:    b LBB4_1
 ; CHECK-O0-NEXT:  LBB4_1: @ %bb_loop
 ; CHECK-O0-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-O0-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
-; CHECK-O0-NEXT:    ldr r1, [r7, #-8] @ 4-byte Reload
-; CHECK-O0-NEXT:    str r1, [sp, #4] @ 4-byte Spill
-; CHECK-O0-NEXT:    cmp r0, #0
+; CHECK-O0-NEXT:    ldr r1, [sp, #8] @ 4-byte Reload
+; CHECK-O0-NEXT:    ldr r0, [r7, #-4] @ 4-byte Reload
+; CHECK-O0-NEXT:    cmp r1, #0
+; CHECK-O0-NEXT:    str r0, [sp, #4] @ 4-byte Spill
 ; CHECK-O0-NEXT:    beq LBB4_3
 ; CHECK-O0-NEXT:  @ %bb.2: @ %gen_error
 ; CHECK-O0-NEXT:    @ in Loop: Header=BB4_1 Depth=1
 ; CHECK-O0-NEXT:    mov r0, #16
 ; CHECK-O0-NEXT:    mov r1, #0
 ; CHECK-O0-NEXT:    bl _malloc
-; CHECK-O0-NEXT:    mov r1, r0
+; CHECK-O0-NEXT:    mov r2, r0
+; CHECK-O0-NEXT:    movw r1, #1
+; CHECK-O0-NEXT:    strb r1, [r2, #8]
 ; CHECK-O0-NEXT:    str r0, [sp, #4] @ 4-byte Spill
-; CHECK-O0-NEXT:    movw r0, #1
-; CHECK-O0-NEXT:    strb r0, [r1, #8]
 ; CHECK-O0-NEXT:  LBB4_3: @ %bb_cont
 ; CHECK-O0-NEXT:    @ in Loop: Header=BB4_1 Depth=1
-; CHECK-O0-NEXT:    vldr s0, [r7, #-4] @ 4-byte Reload
+; CHECK-O0-NEXT:    vldr s0, [r7, #-8] @ 4-byte Reload
 ; CHECK-O0-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
 ; CHECK-O0-NEXT:    str r0, [sp] @ 4-byte Spill
 ; CHECK-O0-NEXT:    vmov.f32 s2, #1.000000e+00
 ; CHECK-O0-NEXT:    vcmp.f32 s0, s2
 ; CHECK-O0-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-O0-NEXT:    str r0, [r7, #-8] @ 4-byte Spill
+; CHECK-O0-NEXT:    str r0, [r7, #-4] @ 4-byte Spill
 ; CHECK-O0-NEXT:    ble LBB4_1
 ; CHECK-O0-NEXT:  @ %bb.4: @ %bb_end
 ; CHECK-O0-NEXT:    ldr r8, [sp] @ 4-byte Reload
@@ -581,20 +581,20 @@ define float @caller3(ptr %error_ref) {
 ;
 ; CHECK-O0-LABEL: caller3:
 ; CHECK-O0:       @ %bb.0: @ %entry
-; CHECK-O0-NEXT:    push {r7, r8, lr}
-; CHECK-O0-NEXT:    add r7, sp, #4
-; CHECK-O0-NEXT:    sub sp, sp, #44
-; CHECK-O0-NEXT:    bfc sp, #0, #3
+; CHECK-O0-NEXT:    push	{r7, r8, lr}
+; CHECK-O0-NEXT:    add	r7, sp, #4
+; CHECK-O0-NEXT:    sub	sp, sp, #44
+; CHECK-O0-NEXT:    bfc	sp, #0, #3
 ; CHECK-O0-NEXT:    @ implicit-def: $r1
-; CHECK-O0-NEXT:    str r0, [sp, #4] @ 4-byte Spill
-; CHECK-O0-NEXT:    mov r8, #0
-; CHECK-O0-NEXT:    add r0, sp, #16
-; CHECK-O0-NEXT:    mov r1, #1
-; CHECK-O0-NEXT:    bl _foo_sret
-; CHECK-O0-NEXT:    str r8, [sp, #8] @ 4-byte Spill
-; CHECK-O0-NEXT:    movw r0, #0
-; CHECK-O0-NEXT:    cmp r8, r0
-; CHECK-O0-NEXT:    bne LBB6_2
+; CHECK-O0-NEXT:    str	r0, [sp, #4] @ 4-byte Spill
+; CHECK-O0-NEXT:    mov	r8, #0
+; CHECK-O0-NEXT:    add	r0, sp, #16
+; CHECK-O0-NEXT:    mov	r1, #1
+; CHECK-O0-NEXT:    bl	_foo_sret
+; CHECK-O0-NEXT:    str	r8, [sp, #8] @ 4-byte Spill
+; CHECK-O0-NEXT:    movw	r0, #0
+; CHECK-O0-NEXT:    cmp	r8, r0
+; CHECK-O0-NEXT:    bne	LBB6_2
 ; CHECK-O0-NEXT:  @ %bb.1: @ %cont
 ; CHECK-O0-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
 ; CHECK-O0-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
@@ -605,7 +605,7 @@ define float @caller3(ptr %error_ref) {
 ; CHECK-O0-NEXT:    bl _free
 ; CHECK-O0-NEXT:    mov r0, #1065353216
 ; CHECK-O0-NEXT:    sub sp, r7, #4
-; CHECK-O0-NEXT:    pop {r7, r8, pc}
+; CHECK-O0-NEXT:    pop	{r7, r8, pc}
 ;
 ; CHECK-ANDROID-LABEL: caller3:
 ; CHECK-ANDROID:       @ %bb.0: @ %entry
@@ -803,26 +803,26 @@ define float @caller4(ptr %error_ref) {
 ;
 ; CHECK-O0-LABEL: caller4:
 ; CHECK-O0:       @ %bb.0: @ %entry
-; CHECK-O0-NEXT:    push {r7, r8, lr}
-; CHECK-O0-NEXT:    add r7, sp, #4
-; CHECK-O0-NEXT:    sub sp, sp, #24
+; CHECK-O0-NEXT:    push	{r7, r8, lr}
+; CHECK-O0-NEXT:    add	r7, sp, #4
+; CHECK-O0-NEXT:    sub	sp, sp, #24
 ; CHECK-O0-NEXT:    @ implicit-def: $r1
-; CHECK-O0-NEXT:    str r0, [sp] @ 4-byte Spill
-; CHECK-O0-NEXT:    mov r8, #0
-; CHECK-O0-NEXT:    mov r0, #10
-; CHECK-O0-NEXT:    str r0, [r7, #-12]
-; CHECK-O0-NEXT:    mov r0, #11
-; CHECK-O0-NEXT:    str r0, [sp, #12]
-; CHECK-O0-NEXT:    mov r0, #12
-; CHECK-O0-NEXT:    str r0, [sp, #8]
-; CHECK-O0-NEXT:    ldr r0, [r7, #-12]
-; CHECK-O0-NEXT:    ldr r1, [sp, #12]
-; CHECK-O0-NEXT:    ldr r2, [sp, #8]
-; CHECK-O0-NEXT:    bl _foo_vararg
-; CHECK-O0-NEXT:    str r8, [sp, #4] @ 4-byte Spill
-; CHECK-O0-NEXT:    movw r0, #0
-; CHECK-O0-NEXT:    cmp r8, r0
-; CHECK-O0-NEXT:    bne LBB8_2
+; CHECK-O0-NEXT:    str	r0, [sp] @ 4-byte Spill
+; CHECK-O0-NEXT:    mov	r8, #0
+; CHECK-O0-NEXT:    mov	r0, #10
+; CHECK-O0-NEXT:    str	r0, [r7, #-12]
+; CHECK-O0-NEXT:    mov	r0, #11
+; CHECK-O0-NEXT:    str	r0, [sp, #12]
+; CHECK-O0-NEXT:    mov	r0, #12
+; CHECK-O0-NEXT:    str	r0, [sp, #8]
+; CHECK-O0-NEXT:    ldr	r0, [r7, #-12]
+; CHECK-O0-NEXT:    ldr	r1, [sp, #12]
+; CHECK-O0-NEXT:    ldr	r2, [sp, #8]
+; CHECK-O0-NEXT:    bl	_foo_vararg
+; CHECK-O0-NEXT:    str	r8, [sp, #4] @ 4-byte Spill
+; CHECK-O0-NEXT:    movw	r0, #0
+; CHECK-O0-NEXT:    cmp	r8, r0
+; CHECK-O0-NEXT:    bne	LBB8_2
 ; CHECK-O0-NEXT:  @ %bb.1: @ %cont
 ; CHECK-O0-NEXT:    ldr r1, [sp] @ 4-byte Reload
 ; CHECK-O0-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
@@ -833,7 +833,7 @@ define float @caller4(ptr %error_ref) {
 ; CHECK-O0-NEXT:    bl _free
 ; CHECK-O0-NEXT:    mov r0, #1065353216
 ; CHECK-O0-NEXT:    sub sp, r7, #4
-; CHECK-O0-NEXT:    pop {r7, r8, pc}
+; CHECK-O0-NEXT:    pop	{r7, r8, pc}
 ;
 ; CHECK-ANDROID-LABEL: caller4:
 ; CHECK-ANDROID:       @ %bb.0: @ %entry
@@ -987,12 +987,12 @@ define swiftcc void @swifterror_reg_clobber(ptr nocapture %err) {
 ;
 ; CHECK-O0-LABEL: swifterror_reg_clobber:
 ; CHECK-O0:       @ %bb.0:
-; CHECK-O0-NEXT:    push {r7, r8, lr}
-; CHECK-O0-NEXT:    add r7, sp, #4
+; CHECK-O0-NEXT:    push	{r7, r8, lr}
+; CHECK-O0-NEXT:    add	r7, sp, #4
 ; CHECK-O0-NEXT:    @ InlineAsm Start
 ; CHECK-O0-NEXT:    nop
 ; CHECK-O0-NEXT:    @ InlineAsm End
-; CHECK-O0-NEXT:    pop {r7, r8, pc}
+; CHECK-O0-NEXT:    pop	{r7, r8, pc}
 ;
 ; CHECK-ANDROID-LABEL: swifterror_reg_clobber:
 ; CHECK-ANDROID:       @ %bb.0:
@@ -1038,34 +1038,34 @@ define swiftcc void @params_in_reg(i32, i32, i32, i32, ptr swiftself, ptr nocapt
 ;
 ; CHECK-O0-LABEL: params_in_reg:
 ; CHECK-O0:       @ %bb.0:
-; CHECK-O0-NEXT:    push {r7, r10, lr}
-; CHECK-O0-NEXT:    add r7, sp, #4
-; CHECK-O0-NEXT:    sub sp, sp, #28
-; CHECK-O0-NEXT:    bfc sp, #0, #3
-; CHECK-O0-NEXT:    str r8, [sp, #20] @ 4-byte Spill
-; CHECK-O0-NEXT:    str r10, [sp] @ 4-byte Spill
-; CHECK-O0-NEXT:    str r3, [sp, #16] @ 4-byte Spill
-; CHECK-O0-NEXT:    str r2, [sp, #12] @ 4-byte Spill
-; CHECK-O0-NEXT:    str r1, [sp, #8] @ 4-byte Spill
-; CHECK-O0-NEXT:    str r0, [sp, #4] @ 4-byte Spill
+; CHECK-O0-NEXT:    push	{r7, r10, lr}
+; CHECK-O0-NEXT:    add	r7, sp, #4
+; CHECK-O0-NEXT:    sub	sp, sp, #28
+; CHECK-O0-NEXT:    bfc	sp, #0, #3
+; CHECK-O0-NEXT:    str	r8, [sp, #20] @ 4-byte Spill
+; CHECK-O0-NEXT:    str	r10, [sp] @ 4-byte Spill
+; CHECK-O0-NEXT:    str	r3, [sp, #16] @ 4-byte Spill
+; CHECK-O0-NEXT:    str	r2, [sp, #12] @ 4-byte Spill
+; CHECK-O0-NEXT:    str	r1, [sp, #8] @ 4-byte Spill
+; CHECK-O0-NEXT:    str	r0, [sp, #4] @ 4-byte Spill
 ; CHECK-O0-NEXT:    @ implicit-def: $r0
-; CHECK-O0-NEXT:    mov r8, #0
-; CHECK-O0-NEXT:    mov r0, #1
-; CHECK-O0-NEXT:    mov r1, #2
-; CHECK-O0-NEXT:    mov r2, #3
-; CHECK-O0-NEXT:    mov r3, #4
-; CHECK-O0-NEXT:    mov r10, r8
-; CHECK-O0-NEXT:    bl _params_in_reg2
-; CHECK-O0-NEXT:    ldr r10, [sp] @ 4-byte Reload
-; CHECK-O0-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
-; CHECK-O0-NEXT:    ldr r1, [sp, #8] @ 4-byte Reload
-; CHECK-O0-NEXT:    ldr r2, [sp, #12] @ 4-byte Reload
-; CHECK-O0-NEXT:    ldr r3, [sp, #16] @ 4-byte Reload
-; CHECK-O0-NEXT:    mov r9, r8
-; CHECK-O0-NEXT:    ldr r8, [sp, #20] @ 4-byte Reload
-; CHECK-O0-NEXT:    bl _params_in_reg2
-; CHECK-O0-NEXT:    sub sp, r7, #4
-; CHECK-O0-NEXT:    pop {r7, r10, pc}
+; CHECK-O0-NEXT:    mov	r8, #0
+; CHECK-O0-NEXT:    mov	r0, #1
+; CHECK-O0-NEXT:    mov	r1, #2
+; CHECK-O0-NEXT:    mov	r2, #3
+; CHECK-O0-NEXT:    mov	r3, #4
+; CHECK-O0-NEXT:    mov	r10, r8
+; CHECK-O0-NEXT:    bl	_params_in_reg2
+; CHECK-O0-NEXT:    ldr	r10, [sp] @ 4-byte Reload
+; CHECK-O0-NEXT:    ldr	r0, [sp, #4] @ 4-byte Reload
+; CHECK-O0-NEXT:    ldr	r1, [sp, #8] @ 4-byte Reload
+; CHECK-O0-NEXT:    ldr	r2, [sp, #12] @ 4-byte Reload
+; CHECK-O0-NEXT:    ldr	r3, [sp, #16] @ 4-byte Reload
+; CHECK-O0-NEXT:    mov	r9, r8
+; CHECK-O0-NEXT:    ldr	r8, [sp, #20] @ 4-byte Reload
+; CHECK-O0-NEXT:    bl	_params_in_reg2
+; CHECK-O0-NEXT:    sub	sp, r7, #4
+; CHECK-O0-NEXT:    pop	{r7, r10, pc}
 ;
 ; CHECK-ANDROID-LABEL: params_in_reg:
 ; CHECK-ANDROID:       @ %bb.0:
@@ -1153,63 +1153,63 @@ define swiftcc { i32, i32, i32, i32} @params_and_return_in_reg(i32, i32, i32, i3
 ;
 ; CHECK-O0-LABEL: params_and_return_in_reg:
 ; CHECK-O0:       @ %bb.0:
-; CHECK-O0-NEXT:    push {r7, r10, lr}
-; CHECK-O0-NEXT:    add r7, sp, #4
-; CHECK-O0-NEXT:    sub sp, sp, #76
-; CHECK-O0-NEXT:    bfc sp, #0, #3
-; CHECK-O0-NEXT:    str r8, [sp, #24] @ 4-byte Spill
-; CHECK-O0-NEXT:    str r10, [sp, #4] @ 4-byte Spill
-; CHECK-O0-NEXT:    str r3, [sp, #20] @ 4-byte Spill
-; CHECK-O0-NEXT:    str r2, [sp, #16] @ 4-byte Spill
-; CHECK-O0-NEXT:    str r1, [sp, #12] @ 4-byte Spill
-; CHECK-O0-NEXT:    str r0, [sp, #8] @ 4-byte Spill
+; CHECK-O0-NEXT:    push	{r7, r10, lr}
+; CHECK-O0-NEXT:    add	r7, sp, #4
+; CHECK-O0-NEXT:    sub	sp, sp, #76
+; CHECK-O0-NEXT:    bfc	sp, #0, #3
+; CHECK-O0-NEXT:    str	r8, [sp, #24] @ 4-byte Spill
+; CHECK-O0-NEXT:    str	r10, [sp, #4] @ 4-byte Spill
+; CHECK-O0-NEXT:    str	r3, [sp, #20] @ 4-byte Spill
+; CHECK-O0-NEXT:    str	r2, [sp, #16] @ 4-byte Spill
+; CHECK-O0-NEXT:    str	r1, [sp, #12] @ 4-byte Spill
+; CHECK-O0-NEXT:    str	r0, [sp, #8] @ 4-byte Spill
 ; CHECK-O0-NEXT:    @ implicit-def: $r0
-; CHECK-O0-NEXT:    mov r8, #0
-; CHECK-O0-NEXT:    str r8, [sp, #28] @ 4-byte Spill
-; CHECK-O0-NEXT:    mov r0, #1
-; CHECK-O0-NEXT:    str r0, [sp, #32] @ 4-byte Spill
-; CHECK-O0-NEXT:    mov r1, #2
-; CHECK-O0-NEXT:    str r1, [sp, #36] @ 4-byte Spill
-; CHECK-O0-NEXT:    mov r2, #3
-; CHECK-O0-NEXT:    str r2, [sp, #40] @ 4-byte Spill
-; CHECK-O0-NEXT:    mov r3, #4
-; CHECK-O0-NEXT:    str r3, [sp, #44] @ 4-byte Spill
-; CHECK-O0-NEXT:    mov r10, r8
-; CHECK-O0-NEXT:    bl _params_in_reg2
-; CHECK-O0-NEXT:    ldr r10, [sp, #4] @ 4-byte Reload
-; CHECK-O0-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
-; CHECK-O0-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-O0-NEXT:    ldr r2, [sp, #16] @ 4-byte Reload
-; CHECK-O0-NEXT:    ldr r3, [sp, #20] @ 4-byte Reload
-; CHECK-O0-NEXT:    mov r9, r8
-; CHECK-O0-NEXT:    ldr r8, [sp, #24] @ 4-byte Reload
-; CHECK-O0-NEXT:    str r9, [sp, #48] @ 4-byte Spill
-; CHECK-O0-NEXT:    bl _params_and_return_in_reg2
-; CHECK-O0-NEXT:    ldr r10, [sp, #28] @ 4-byte Reload
-; CHECK-O0-NEXT:    mov r9, r0
-; CHECK-O0-NEXT:    ldr r0, [sp, #32] @ 4-byte Reload
-; CHECK-O0-NEXT:    str r9, [sp, #52] @ 4-byte Spill
-; CHECK-O0-NEXT:    mov r9, r1
-; CHECK-O0-NEXT:    ldr r1, [sp, #36] @ 4-byte Reload
-; CHECK-O0-NEXT:    str r9, [sp, #56] @ 4-byte Spill
-; CHECK-O0-NEXT:    mov r9, r2
-; CHECK-O0-NEXT:    ldr r2, [sp, #40] @ 4-byte Reload
-; CHECK-O0-NEXT:    str r9, [sp, #60] @ 4-byte Spill
-; CHECK-O0-NEXT:    mov r9, r3
-; CHECK-O0-NEXT:    ldr r3, [sp, #44] @ 4-byte Reload
-; CHECK-O0-NEXT:    str r9, [sp, #64] @ 4-byte Spill
-; CHECK-O0-NEXT:    mov r9, r8
-; CHECK-O0-NEXT:    ldr r8, [sp, #48] @ 4-byte Reload
-; CHECK-O0-NEXT:    str r9, [sp, #68] @ 4-byte Spill
-; CHECK-O0-NEXT:    bl _params_in_reg2
-; CHECK-O0-NEXT:    ldr r0, [sp, #52] @ 4-byte Reload
-; CHECK-O0-NEXT:    ldr r1, [sp, #56] @ 4-byte Reload
-; CHECK-O0-NEXT:    ldr r2, [sp, #60] @ 4-byte Reload
-; CHECK-O0-NEXT:    ldr r3, [sp, #64] @ 4-byte Reload
-; CHECK-O0-NEXT:    mov r9, r8
-; CHECK-O0-NEXT:    ldr r8, [sp, #68] @ 4-byte Reload
-; CHECK-O0-NEXT:    sub sp, r7, #4
-; CHECK-O0-NEXT:    pop {r7, r10, pc}
+; CHECK-O0-NEXT:    mov	r8, #0
+; CHECK-O0-NEXT:    str	r8, [sp, #28] @ 4-byte Spill
+; CHECK-O0-NEXT:    mov	r0, #1
+; CHECK-O0-NEXT:    str	r0, [sp, #32] @ 4-byte Spill
+; CHECK-O0-NEXT:    mov	r1, #2
+; CHECK-O0-NEXT:    str	r1, [sp, #36] @ 4-byte Spill
+; CHECK-O0-NEXT:    mov	r2, #3
+; CHECK-O0-NEXT:    str	r2, [sp, #40] @ 4-byte Spill
+; CHECK-O0-NEXT:    mov	r3, #4
+; CHECK-O0-NEXT:    str	r3, [sp, #44] @ 4-byte Spill
+; CHECK-O0-NEXT:    mov	r10, r8
+; CHECK-O0-NEXT:    bl	_params_in_reg2
+; CHECK-O0-NEXT:    ldr	r10, [sp, #4] @ 4-byte Reload
+; CHECK-O0-NEXT:    ldr	r0, [sp, #8] @ 4-byte Reload
+; CHECK-O0-NEXT:    ldr	r1, [sp, #12] @ 4-byte Reload
+; CHECK-O0-NEXT:    ldr	r2, [sp, #16] @ 4-byte Reload
+; CHECK-O0-NEXT:    ldr	r3, [sp, #20] @ 4-byte Reload
+; CHECK-O0-NEXT:    mov	r9, r8
+; CHECK-O0-NEXT:    ldr	r8, [sp, #24] @ 4-byte Reload
+; CHECK-O0-NEXT:    str	r9, [sp, #48] @ 4-byte Spill
+; CHECK-O0-NEXT:    bl	_params_and_return_in_reg2
+; CHECK-O0-NEXT:    ldr	r10, [sp, #28] @ 4-byte Reload
+; CHECK-O0-NEXT:    mov	r9, r0
+; CHECK-O0-NEXT:    ldr	r0, [sp, #32] @ 4-byte Reload
+; CHECK-O0-NEXT:    str	r9, [sp, #52] @ 4-byte Spill
+; CHECK-O0-NEXT:    mov	r9, r1
+; CHECK-O0-NEXT:    ldr	r1, [sp, #36] @ 4-byte Reload
+; CHECK-O0-NEXT:    str	r9, [sp, #56] @ 4-byte Spill
+; CHECK-O0-NEXT:    mov	r9, r2
+; CHECK-O0-NEXT:    ldr	r2, [sp, #40] @ 4-byte Reload
+; CHECK-O0-NEXT:    str	r9, [sp, #60] @ 4-byte Spill
+; CHECK-O0-NEXT:    mov	r9, r3
+; CHECK-O0-NEXT:    ldr	r3, [sp, #44] @ 4-byte Reload
+; CHECK-O0-NEXT:    str	r9, [sp, #64] @ 4-byte Spill
+; CHECK-O0-NEXT:    mov	r9, r8
+; CHECK-O0-NEXT:    ldr	r8, [sp, #48] @ 4-byte Reload
+; CHECK-O0-NEXT:    str	r9, [sp, #68] @ 4-byte Spill
+; CHECK-O0-NEXT:    bl	_params_in_reg2
+; CHECK-O0-NEXT:    ldr	r0, [sp, #52] @ 4-byte Reload
+; CHECK-O0-NEXT:    ldr	r1, [sp, #56] @ 4-byte Reload
+; CHECK-O0-NEXT:    ldr	r2, [sp, #60] @ 4-byte Reload
+; CHECK-O0-NEXT:    ldr	r3, [sp, #64] @ 4-byte Reload
+; CHECK-O0-NEXT:    mov	r9, r8
+; CHECK-O0-NEXT:    ldr	r8, [sp, #68] @ 4-byte Reload
+; CHECK-O0-NEXT:    sub	sp, r7, #4
+; CHECK-O0-NEXT:    pop	{r7, r10, pc}
 ;
 ; CHECK-ANDROID-LABEL: params_and_return_in_reg:
 ; CHECK-ANDROID:       @ %bb.0:
@@ -1325,17 +1325,17 @@ define swiftcc ptr @testAssign(ptr %error_ref) {
 ;
 ; CHECK-O0-LABEL: testAssign:
 ; CHECK-O0:       @ %bb.0: @ %entry
-; CHECK-O0-NEXT:    push {r7, r8, lr}
-; CHECK-O0-NEXT:    add r7, sp, #4
-; CHECK-O0-NEXT:    sub sp, sp, #8
+; CHECK-O0-NEXT:    push	{r7, r8, lr}
+; CHECK-O0-NEXT:    add	r7, sp, #4
+; CHECK-O0-NEXT:    sub	sp, sp, #8
 ; CHECK-O0-NEXT:    @ implicit-def: $r1
-; CHECK-O0-NEXT:    mov r8, #0
-; CHECK-O0-NEXT:    bl _foo2
-; CHECK-O0-NEXT:    str r8, [sp] @ 4-byte Spill
+; CHECK-O0-NEXT:    mov	r8, #0
+; CHECK-O0-NEXT:    bl	_foo2
+; CHECK-O0-NEXT:    str	r8, [sp] @ 4-byte Spill
 ; CHECK-O0-NEXT:  @ %bb.1: @ %a
 ; CHECK-O0-NEXT:    ldr r0, [sp] @ 4-byte Reload
 ; CHECK-O0-NEXT:    sub sp, r7, #4
-; CHECK-O0-NEXT:    pop {r7, r8, pc}
+; CHECK-O0-NEXT:    pop	{r7, r8, pc}
 ;
 ; CHECK-ANDROID-LABEL: testAssign:
 ; CHECK-ANDROID:       @ %bb.0: @ %entry

diff  --git a/llvm/test/CodeGen/AVR/bug-81911.ll b/llvm/test/CodeGen/AVR/bug-81911.ll
index d3436e2da1d3d..2a22666a1ff92 100644
--- a/llvm/test/CodeGen/AVR/bug-81911.ll
+++ b/llvm/test/CodeGen/AVR/bug-81911.ll
@@ -41,31 +41,31 @@ define internal i8 @main() {
 ; CHECK-NEXT:    adiw r24, 6
 ; CHECK-NEXT:    std Y+3, r25 ; 2-byte Folded Spill
 ; CHECK-NEXT:    std Y+2, r24 ; 2-byte Folded Spill
-; CHECK-NEXT:    movw r2, r16
-; CHECK-NEXT:    movw r4, r16
-; CHECK-NEXT:    movw r6, r16
 ; CHECK-NEXT:    movw r8, r16
+; CHECK-NEXT:    movw r6, r16
+; CHECK-NEXT:    movw r4, r16
+; CHECK-NEXT:    movw r2, r16
 ; CHECK-NEXT:    rjmp .LBB0_2
 ; CHECK-NEXT:  .LBB0_1: ; %bb1
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    andi r30, 1
 ; CHECK-NEXT:    ldd r31, Y+4 ; 1-byte Folded Reload
 ; CHECK-NEXT:    dec r31
-; CHECK-NEXT:    movw r8, r24
-; CHECK-NEXT:    movw r6, r22
-; CHECK-NEXT:    movw r4, r20
-; CHECK-NEXT:    movw r2, r18
 ; CHECK-NEXT:    cpi r30, 0
+; CHECK-NEXT:    movw r8, r18
+; CHECK-NEXT:    movw r6, r20
+; CHECK-NEXT:    movw r4, r22
+; CHECK-NEXT:    movw r2, r24
 ; CHECK-NEXT:    mov r18, r31
 ; CHECK-NEXT:    brne .LBB0_2
 ; CHECK-NEXT:    rjmp .LBB0_4
 ; CHECK-NEXT:  .LBB0_2: ; %bb1
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    std Y+4, r18 ; 1-byte Folded Spill
-; CHECK-NEXT:    movw r18, r2
-; CHECK-NEXT:    movw r20, r4
-; CHECK-NEXT:    movw r22, r6
-; CHECK-NEXT:    movw r24, r8
+; CHECK-NEXT:    movw r18, r8
+; CHECK-NEXT:    movw r20, r6
+; CHECK-NEXT:    movw r22, r4
+; CHECK-NEXT:    movw r24, r2
 ; CHECK-NEXT:    ldi r26, 10
 ; CHECK-NEXT:    ldi r27, 0
 ; CHECK-NEXT:    movw r10, r26
@@ -85,14 +85,14 @@ define internal i8 @main() {
 ; CHECK-NEXT:    ;APP
 ; CHECK-NEXT:    ;NO_APP
 ; CHECK-NEXT:    ldi r30, 1
-; CHECK-NEXT:    cp r2, r1
-; CHECK-NEXT:    cpc r3, r1
-; CHECK-NEXT:    cpc r4, r16
-; CHECK-NEXT:    cpc r5, r17
+; CHECK-NEXT:    cp r8, r1
+; CHECK-NEXT:    cpc r9, r1
 ; CHECK-NEXT:    cpc r6, r16
 ; CHECK-NEXT:    cpc r7, r17
-; CHECK-NEXT:    cpc r8, r16
-; CHECK-NEXT:    cpc r9, r17
+; CHECK-NEXT:    cpc r4, r16
+; CHECK-NEXT:    cpc r5, r17
+; CHECK-NEXT:    cpc r2, r16
+; CHECK-NEXT:    cpc r3, r17
 ; CHECK-NEXT:    breq .LBB0_3
 ; CHECK-NEXT:    rjmp .LBB0_1
 ; CHECK-NEXT:  .LBB0_3: ; %bb1

diff  --git a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll
index 4d75a733526b0..006a8b6bfc94a 100644
--- a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s
 
 ; This version of the conv3x3 test has both loops. This test checks that the
-; inner loop has 13 packets.
+; inner loop has 14 packets.
 
 ; CHECK: loop0(.LBB0_[[LOOP:.]],
 ; CHECK: .LBB0_[[LOOP]]:
@@ -17,6 +17,7 @@
 ; CHECK: }
 ; CHECK: }
 ; CHECK: }
+; CHECK: }
 ; CHECK-NOT: }
 ; CHECK: }{{[ \t]*}}:endloop0
 

diff  --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll
index b50290525002d..96a38939dc50e 100644
--- a/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll
@@ -12,8 +12,8 @@
 ; CHECK: [[EPLOG]]:
 ; CHECK: [[VREG1:v([0-9]+)]] = [[VREG]]
 ; CHECK: [[EPLOG1]]:
-; CHECK: [[VREG2:v[0-9]+]] = [[VREG]]
-; CHECK: = vlalign([[VREG2]],[[VREG1]],#1)
+; CHECK: [[VREG2:v[0-9]+]] = [[VREG1]]
+; CHECK: = vlalign([[VREG1]],[[VREG2]],#1)
 
 ; Function Attrs: nounwind
 define void @f0(ptr noalias nocapture readonly %a0, i32 %a1, i32 %a2, ptr noalias nocapture readonly %a3, i32 %a4, ptr noalias nocapture %a5, i32 %a6) #0 {

diff  --git a/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll b/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll
index c4dbbcc5969ca..42efe60b96d48 100644
--- a/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll
@@ -3,7 +3,7 @@
 ; From coremark. Test that we pipeline the matrix multiplication bitextract
 ; function. The pipelined code should have two packets.
 
-; CHECK: loop0(.LBB0_[[LOOP:[0-9]+]],
+; CHECK: loop0(.LBB0_[[LOOP:.]],
 ; CHECK: .LBB0_[[LOOP]]:
 ; CHECK: [[REG0:(r[0-9]+)]] = mpyi([[REG1:(r[0-9]+)]],[[REG2:(r[0-9]+)]])
 ; CHECK: += mpyi

diff  --git a/llvm/test/CodeGen/Hexagon/swp-stages4.ll b/llvm/test/CodeGen/Hexagon/swp-stages4.ll
index bddf9cebe7160..0d029dc7d2f2e 100644
--- a/llvm/test/CodeGen/Hexagon/swp-stages4.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-stages4.ll
@@ -3,8 +3,11 @@
 ; Test that we rename registers correctly for multiple stages when there is a
 ; Phi and depends upon another Phi.
 
-; CHECK: jump
-; CHECK-NEXT: r[[REG0:[0-9]+]] = and(r[[REG1:[0-9]+]],#255)
+; CHECK: = and
+; CHECK: = and
+; CHECK: r[[REGA:[0-9]+]] = memub(r{{[0-9]+}}+#1)
+; CHECK: = and
+; CHECK: r[[REG0:[0-9]+]] = and(r[[REG1:[0-9]+]],#255)
 ; CHECK-NOT: r[[REG0]] = and(r[[REG1]],#255)
 ; CHECK: loop0(.LBB0_[[LOOP:.]],
 ; CHECK: .LBB0_[[LOOP]]:

diff  --git a/llvm/test/CodeGen/Hexagon/tinycore.ll b/llvm/test/CodeGen/Hexagon/tinycore.ll
index b20a7831df4d8..c44038e767194 100644
--- a/llvm/test/CodeGen/Hexagon/tinycore.ll
+++ b/llvm/test/CodeGen/Hexagon/tinycore.ll
@@ -8,15 +8,10 @@
 ; CHECK: .LBB0_[[LOOP]]:
 ; CHECK: {
 ; CHECK-NEXT: mpy
-; CHECK-NOT: memw
-; CHECK: }
-; CHECK: {
-; CHECK: memw
-; CHECK-NOT: memw
-; CHECK: }
-; CHECK: {
+; CHECK-NEXT: combine
+; CHECK-NEXT: memw
+; CHECK-NEXT: }
 ; CHECK: memw
-; CHECK-NOT: memw
 ; CHECK: } :endloop0
 
 ; Test the loop contains a single packet with 4 instructions.

diff  --git a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
index 06edb736e0435..9142e718e8adc 100644
--- a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
+++ b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
@@ -39,17 +39,17 @@ define void @test_la_pcrel(i32 signext %n) {
 ;
 ; LA64LARGE-LABEL: test_la_pcrel:
 ; LA64LARGE:       # %bb.0: # %entry
-; LA64LARGE-NEXT:    move $a1, $zero
-; LA64LARGE-NEXT:    pcalau12i $a2, %pc_hi20(l)
-; LA64LARGE-NEXT:    addi.d $a3, $zero, %pc_lo12(l)
-; LA64LARGE-NEXT:    lu32i.d $a3, %pc64_lo20(l)
-; LA64LARGE-NEXT:    lu52i.d $a3, $a3, %pc64_hi12(l)
+; LA64LARGE-NEXT:    pcalau12i $a1, %pc_hi20(l)
+; LA64LARGE-NEXT:    addi.d $a2, $zero, %pc_lo12(l)
+; LA64LARGE-NEXT:    lu32i.d $a2, %pc64_lo20(l)
+; LA64LARGE-NEXT:    lu52i.d $a2, $a2, %pc64_hi12(l)
+; LA64LARGE-NEXT:    move $a3, $zero
 ; LA64LARGE-NEXT:    .p2align 4, , 16
 ; LA64LARGE-NEXT:  .LBB0_1: # %loop
 ; LA64LARGE-NEXT:    # =>This Inner Loop Header: Depth=1
-; LA64LARGE-NEXT:    ldx.w $zero, $a3, $a2
-; LA64LARGE-NEXT:    addi.w $a1, $a1, 1
-; LA64LARGE-NEXT:    blt $a1, $a0, .LBB0_1
+; LA64LARGE-NEXT:    ldx.w $zero, $a2, $a1
+; LA64LARGE-NEXT:    addi.w $a3, $a3, 1
+; LA64LARGE-NEXT:    blt $a3, $a0, .LBB0_1
 ; LA64LARGE-NEXT:  # %bb.2: # %ret
 ; LA64LARGE-NEXT:    ret
 entry:
@@ -99,18 +99,18 @@ define void @test_la_got(i32 signext %n) {
 ;
 ; LA64LARGE-LABEL: test_la_got:
 ; LA64LARGE:       # %bb.0: # %entry
-; LA64LARGE-NEXT:    move $a1, $zero
-; LA64LARGE-NEXT:    pcalau12i $a2, %got_pc_hi20(g)
-; LA64LARGE-NEXT:    addi.d $a3, $zero, %got_pc_lo12(g)
-; LA64LARGE-NEXT:    lu32i.d $a3, %got64_pc_lo20(g)
-; LA64LARGE-NEXT:    lu52i.d $a3, $a3, %got64_pc_hi12(g)
-; LA64LARGE-NEXT:    ldx.d $a2, $a3, $a2
+; LA64LARGE-NEXT:    pcalau12i $a1, %got_pc_hi20(g)
+; LA64LARGE-NEXT:    addi.d $a2, $zero, %got_pc_lo12(g)
+; LA64LARGE-NEXT:    lu32i.d $a2, %got64_pc_lo20(g)
+; LA64LARGE-NEXT:    lu52i.d $a2, $a2, %got64_pc_hi12(g)
+; LA64LARGE-NEXT:    ldx.d $a1, $a2, $a1
+; LA64LARGE-NEXT:    move $a2, $zero
 ; LA64LARGE-NEXT:    .p2align 4, , 16
 ; LA64LARGE-NEXT:  .LBB1_1: # %loop
 ; LA64LARGE-NEXT:    # =>This Inner Loop Header: Depth=1
-; LA64LARGE-NEXT:    ld.w $zero, $a2, 0
-; LA64LARGE-NEXT:    addi.w $a1, $a1, 1
-; LA64LARGE-NEXT:    blt $a1, $a0, .LBB1_1
+; LA64LARGE-NEXT:    ld.w $zero, $a1, 0
+; LA64LARGE-NEXT:    addi.w $a2, $a2, 1
+; LA64LARGE-NEXT:    blt $a2, $a0, .LBB1_1
 ; LA64LARGE-NEXT:  # %bb.2: # %ret
 ; LA64LARGE-NEXT:    ret
 entry:
@@ -161,18 +161,18 @@ define void @test_la_tls_ie(i32 signext %n) {
 ;
 ; LA64LARGE-LABEL: test_la_tls_ie:
 ; LA64LARGE:       # %bb.0: # %entry
-; LA64LARGE-NEXT:    move $a1, $zero
-; LA64LARGE-NEXT:    pcalau12i $a2, %ie_pc_hi20(ie)
-; LA64LARGE-NEXT:    addi.d $a3, $zero, %ie_pc_lo12(ie)
-; LA64LARGE-NEXT:    lu32i.d $a3, %ie64_pc_lo20(ie)
-; LA64LARGE-NEXT:    lu52i.d $a3, $a3, %ie64_pc_hi12(ie)
-; LA64LARGE-NEXT:    ldx.d $a2, $a3, $a2
+; LA64LARGE-NEXT:    pcalau12i $a1, %ie_pc_hi20(ie)
+; LA64LARGE-NEXT:    addi.d $a2, $zero, %ie_pc_lo12(ie)
+; LA64LARGE-NEXT:    lu32i.d $a2, %ie64_pc_lo20(ie)
+; LA64LARGE-NEXT:    lu52i.d $a2, $a2, %ie64_pc_hi12(ie)
+; LA64LARGE-NEXT:    ldx.d $a1, $a2, $a1
+; LA64LARGE-NEXT:    move $a2, $zero
 ; LA64LARGE-NEXT:    .p2align 4, , 16
 ; LA64LARGE-NEXT:  .LBB2_1: # %loop
 ; LA64LARGE-NEXT:    # =>This Inner Loop Header: Depth=1
-; LA64LARGE-NEXT:    ldx.w $zero, $a2, $tp
-; LA64LARGE-NEXT:    addi.w $a1, $a1, 1
-; LA64LARGE-NEXT:    blt $a1, $a0, .LBB2_1
+; LA64LARGE-NEXT:    ldx.w $zero, $a1, $tp
+; LA64LARGE-NEXT:    addi.w $a2, $a2, 1
+; LA64LARGE-NEXT:    blt $a2, $a0, .LBB2_1
 ; LA64LARGE-NEXT:  # %bb.2: # %ret
 ; LA64LARGE-NEXT:    ret
 entry:
@@ -270,11 +270,11 @@ define void @test_la_tls_ld(i32 signext %n) {
 ; LA64LARGE-NEXT:    .cfi_offset 23, -24
 ; LA64LARGE-NEXT:    .cfi_offset 24, -32
 ; LA64LARGE-NEXT:    move $fp, $a0
-; LA64LARGE-NEXT:    move $s1, $zero
 ; LA64LARGE-NEXT:    pcalau12i $a0, %ld_pc_hi20(ld)
 ; LA64LARGE-NEXT:    addi.d $a1, $zero, %got_pc_lo12(ld)
 ; LA64LARGE-NEXT:    lu32i.d $a1, %got64_pc_lo20(ld)
 ; LA64LARGE-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(ld)
+; LA64LARGE-NEXT:    move $s1, $zero
 ; LA64LARGE-NEXT:    add.d $s0, $a1, $a0
 ; LA64LARGE-NEXT:    .p2align 4, , 16
 ; LA64LARGE-NEXT:  .LBB3_1: # %loop
@@ -436,11 +436,11 @@ define void @test_la_tls_gd(i32 signext %n) nounwind {
 ; LA64LARGE-NEXT:    st.d $s0, $sp, 8 # 8-byte Folded Spill
 ; LA64LARGE-NEXT:    st.d $s1, $sp, 0 # 8-byte Folded Spill
 ; LA64LARGE-NEXT:    move $fp, $a0
-; LA64LARGE-NEXT:    move $s1, $zero
 ; LA64LARGE-NEXT:    pcalau12i $a0, %gd_pc_hi20(gd)
 ; LA64LARGE-NEXT:    addi.d $a1, $zero, %got_pc_lo12(gd)
 ; LA64LARGE-NEXT:    lu32i.d $a1, %got64_pc_lo20(gd)
 ; LA64LARGE-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(gd)
+; LA64LARGE-NEXT:    move $s1, $zero
 ; LA64LARGE-NEXT:    add.d $s0, $a1, $a0
 ; LA64LARGE-NEXT:    .p2align 4, , 16
 ; LA64LARGE-NEXT:  .LBB5_1: # %loop

diff  --git a/llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir b/llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir
index 59f1477b5c37f..8bdf719f4bb5b 100644
--- a/llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir
+++ b/llvm/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.mir
@@ -80,9 +80,10 @@ body:             |
 # CHECK-NEXT:    %15:g8rc = COPY killed %6
 # CHECK:       bb.3:
 # CHECK:         %10:g8rc = COPY killed %15
-# CHECK-NEXT:    %16:g8rc_and_g8rc_nox0 = COPY killed %14
+# CHECK-NEXT:    %9:g8rc = COPY killed %14
 # CHECK-NEXT:    %14:g8rc = COPY killed %10
 # CHECK-NEXT:    %15:g8rc = IMPLICIT_DEF
+# CHECK-NEXT:    %16:g8rc_and_g8rc_nox0 = COPY killed %9
 # CHECK-NEXT:    BCC 68, %7, %bb.3
 # CHECK-NEXT:    B %bb.4
 # CHECK:       bb.4:

diff  --git a/llvm/test/CodeGen/PowerPC/disable-ctr-ppcf128.ll b/llvm/test/CodeGen/PowerPC/disable-ctr-ppcf128.ll
index cd2fbdfe71263..cd5ea16d4600b 100644
--- a/llvm/test/CodeGen/PowerPC/disable-ctr-ppcf128.ll
+++ b/llvm/test/CodeGen/PowerPC/disable-ctr-ppcf128.ll
@@ -51,9 +51,9 @@ define ppc_fp128 @test_ctr0() {
 ; P9BE-NEXT:    .cfi_offset r30, -16
 ; P9BE-NEXT:    li r3, 1
 ; P9BE-NEXT:    std r30, 112(r1) # 8-byte Folded Spill
-; P9BE-NEXT:    xxlxor f2, f2, f2
-; P9BE-NEXT:    rldic r30, r3, 62, 1
 ; P9BE-NEXT:    xxlxor f1, f1, f1
+; P9BE-NEXT:    rldic r30, r3, 62, 1
+; P9BE-NEXT:    xxlxor f2, f2, f2
 ; P9BE-NEXT:    .p2align 5
 ; P9BE-NEXT:  .LBB0_1: # %bb6
 ; P9BE-NEXT:    #
@@ -111,8 +111,8 @@ define ppc_fp128 @test_ctr0() {
 ; P8BE-NEXT:    .cfi_offset r30, -16
 ; P8BE-NEXT:    li r3, 1
 ; P8BE-NEXT:    std r30, 112(r1) # 8-byte Folded Spill
-; P8BE-NEXT:    xxlxor f2, f2, f2
 ; P8BE-NEXT:    xxlxor f1, f1, f1
+; P8BE-NEXT:    xxlxor f2, f2, f2
 ; P8BE-NEXT:    rldic r30, r3, 62, 1
 ; P8BE-NEXT:    .p2align 5
 ; P8BE-NEXT:  .LBB0_1: # %bb6

diff  --git a/llvm/test/CodeGen/PowerPC/phi-eliminate.mir b/llvm/test/CodeGen/PowerPC/phi-eliminate.mir
index a4b18e648b7a2..72f778286abe4 100644
--- a/llvm/test/CodeGen/PowerPC/phi-eliminate.mir
+++ b/llvm/test/CodeGen/PowerPC/phi-eliminate.mir
@@ -195,9 +195,12 @@ body:             |
   ; CHECK: bb.4:
   ; CHECK:   successors: %bb.5(0x80000000)
   ; CHECK:   %44:g8rc_and_g8rc_nox0 = COPY killed %59
-  ; CHECK:   %63:gprc = COPY killed %57
-  ; CHECK:   %61:gprc = COPY killed %60
-  ; CHECK:   %62:g8rc_and_g8rc_nox0 = COPY killed %44
+  ; CHECK:   %43:gprc = COPY killed %57
+  ; CHECK:   %41:gprc = COPY killed %60
+  ; CHECK:   %39:g8rc = COPY killed %44
+  ; CHECK:   %61:gprc = COPY killed %41
+  ; CHECK:   %62:g8rc_and_g8rc_nox0 = COPY killed %39
+  ; CHECK:   %63:gprc = COPY killed %43
 
   ; CHECK: bb.5:
   ; CHECK:   successors: %bb.6(0x80000000)

diff  --git a/llvm/test/CodeGen/PowerPC/ppcf128-freeze.mir b/llvm/test/CodeGen/PowerPC/ppcf128-freeze.mir
index 4cad98eeade77..474c288bba88b 100644
--- a/llvm/test/CodeGen/PowerPC/ppcf128-freeze.mir
+++ b/llvm/test/CodeGen/PowerPC/ppcf128-freeze.mir
@@ -1,10 +1,21 @@
-# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 # RUN: llc -mtriple powerpc64le-unknown-linux-gnu -start-after=codegenprepare \
 # RUN:   -o - %s -verify-machineinstrs | FileCheck %s
 
 --- |
   define ppc_fp128 @freeze_select(ppc_fp128 %a, ppc_fp128 %b) {
-  ; CHECK-LABEL: freeze_select:
+    %sel.frozen = freeze ppc_fp128 %a
+    %cmp = fcmp one ppc_fp128 %sel.frozen, 0xM00000000000000000000000000000000
+    br i1 %cmp, label %select.end, label %select.false
+
+  select.false:                                     ; preds = %0
+    br label %select.end
+
+  select.end:                                       ; preds = %0, %select.false
+    %sel = phi ppc_fp128 [ %a, %0 ], [ %b, %select.false ]
+    ret ppc_fp128 %sel
+  }
+
+  ; CHECK-LABEL: freeze_select
   ; CHECK:       # %bb.0:
   ; CHECK-NEXT:    xxlxor 0, 0, 0
   ; CHECK-NEXT:    fcmpu 1, 2, 2
@@ -17,19 +28,8 @@
   ; CHECK-NEXT:  # %bb.1:
   ; CHECK-NEXT:    crnor 20, 7, 2
   ; CHECK-NEXT:    bclr 12, 20, 0
-  ; CHECK-NEXT:  # %bb.2: # %select.false
-  ; CHECK-NEXT:    fmr 2, 4
+  ; CHECK-NEXT:  # %bb.2:                                # %select.false
   ; CHECK-NEXT:    fmr 1, 3
+  ; CHECK-NEXT:    fmr 2, 4
   ; CHECK-NEXT:    blr
-    %sel.frozen = freeze ppc_fp128 %a
-    %cmp = fcmp one ppc_fp128 %sel.frozen, 0xM00000000000000000000000000000000
-    br i1 %cmp, label %select.end, label %select.false
-
-  select.false:                                     ; preds = %0
-    br label %select.end
-
-  select.end:                                       ; preds = %0, %select.false
-    %sel = phi ppc_fp128 [ %a, %0 ], [ %b, %select.false ]
-    ret ppc_fp128 %sel
-  }
 ...

diff  --git a/llvm/test/CodeGen/PowerPC/pr116071.ll b/llvm/test/CodeGen/PowerPC/pr116071.ll
index 5db84436c22f6..29f11fc1d3a63 100644
--- a/llvm/test/CodeGen/PowerPC/pr116071.ll
+++ b/llvm/test/CodeGen/PowerPC/pr116071.ll
@@ -1,26 +1,9 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -disable-ppc-vsx-fma-mutation=false -mcpu=pwr10 -verify-machineinstrs \
-; RUN:   -ppc-asm-full-reg-names -mtriple powerpc64-ibm-aix7.2.0.0 < %s | FileCheck %s
+; RUN:   -ppc-asm-full-reg-names -mtriple powerpc64-ibm-aix7.2.0.0 < %s | FileCheck %s 
 
 target datalayout = "E-m:a-Fi64-i64:64-n32:64-S128-v256:256:256-v512:512:512"
 
 define void @initial(<2 x double> %0){
-; CHECK-LABEL: initial:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xxlxor vs0, vs0, vs0
-; CHECK-NEXT:    xxlxor f2, f2, f2
-; CHECK-NEXT:    xxlxor f4, f4, f4
-; CHECK-NEXT:    xxlxor f3, f3, f3
-; CHECK-NEXT:    xvmuldp vs1, vs34, vs0
-; CHECK-NEXT:    .align 5
-; CHECK-NEXT:  L..BB0_1: # %for.cond251.preheader.lr.ph
-; CHECK-NEXT:    #
-; CHECK-NEXT:    fmr f5, f3
-; CHECK-NEXT:    xsadddp f3, f3, f4
-; CHECK-NEXT:    fmr f4, f5
-; CHECK-NEXT:    xxmrghd vs3, vs3, vs2
-; CHECK-NEXT:    xvmaddmdp vs3, vs0, vs1
-; CHECK-NEXT:    b L..BB0_1
 entry:
   %1 = fmul <2 x double> %0, zeroinitializer
   br label %for.cond251.preheader.lr.ph
@@ -35,3 +18,9 @@ for.cond251.preheader.lr.ph:                      ; preds = %for.cond251.prehead
   %7 = extractelement <2 x double> %6, i64 0
   br label %for.cond251.preheader.lr.ph
 }
+
+; CHECK:        xsadddp f4, f3, f4
+; CHECK-NEXT:   xxmrghd vs5, vs4, vs2
+; CHECK-NEXT:   fmr f4, f3
+; CHECK-NEXT:   xvmaddmdp vs5, vs0, vs1
+; CHECK-NEXT:   fmr f3, f5

diff  --git a/llvm/test/CodeGen/PowerPC/sms-phi-2.ll b/llvm/test/CodeGen/PowerPC/sms-phi-2.ll
index 0077673292ab3..4904d11fc8104 100644
--- a/llvm/test/CodeGen/PowerPC/sms-phi-2.ll
+++ b/llvm/test/CodeGen/PowerPC/sms-phi-2.ll
@@ -9,7 +9,7 @@ define void @phi2(i32, i32, ptr) local_unnamed_addr {
 ; CHECK-NEXT:    li 5, 55
 ; CHECK-NEXT:    li 6, 48
 ; CHECK-NEXT:    mtctr 3
-; CHECK-NEXT:    bdz .LBB0_3
+; CHECK-NEXT:    bdz .LBB0_4
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    divw 9, 8, 4
 ; CHECK-NEXT:    mullw 7, 8, 4
@@ -19,7 +19,7 @@ define void @phi2(i32, i32, ptr) local_unnamed_addr {
 ; CHECK-NEXT:    add 3, 7, 3
 ; CHECK-NEXT:    stbu 3, -1(7)
 ; CHECK-NEXT:    mr 3, 8
-; CHECK-NEXT:    bdz .LBB0_4
+; CHECK-NEXT:    bdz .LBB0_3
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB0_2:
 ; CHECK-NEXT:    mr 3, 9
@@ -33,12 +33,13 @@ define void @phi2(i32, i32, ptr) local_unnamed_addr {
 ; CHECK-NEXT:    stbu 8, -1(7)
 ; CHECK-NEXT:    mr 8, 3
 ; CHECK-NEXT:    bdnz .LBB0_2
-; CHECK-NEXT:    b .LBB0_4
 ; CHECK-NEXT:  .LBB0_3:
-; CHECK-NEXT:    # implicit-def: $x7
-; CHECK-NEXT:    mr 9, 8
+; CHECK-NEXT:    mr 8, 9
+; CHECK-NEXT:    b .LBB0_5
 ; CHECK-NEXT:  .LBB0_4:
-; CHECK-NEXT:    mullw 4, 9, 4
+; CHECK-NEXT:    # implicit-def: $x7
+; CHECK-NEXT:  .LBB0_5:
+; CHECK-NEXT:    mullw 4, 8, 4
 ; CHECK-NEXT:    sub 3, 3, 4
 ; CHECK-NEXT:    cmplwi 3, 10
 ; CHECK-NEXT:    isellt 4, 6, 5

diff  --git a/llvm/test/CodeGen/PowerPC/sms-phi-3.ll b/llvm/test/CodeGen/PowerPC/sms-phi-3.ll
index 2dd8b36389c62..628822edabf39 100644
--- a/llvm/test/CodeGen/PowerPC/sms-phi-3.ll
+++ b/llvm/test/CodeGen/PowerPC/sms-phi-3.ll
@@ -19,34 +19,34 @@ define void @phi3(ptr) nounwind {
 ; CHECK-NEXT:    mr 29, 3
 ; CHECK-NEXT:    bl malloc
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    addi 6, 30, -4
+; CHECK-NEXT:    addi 7, 30, -4
 ; CHECK-NEXT:    mtctr 3
 ; CHECK-NEXT:    addi 4, 29, -8
 ; CHECK-NEXT:    li 5, 0
-; CHECK-NEXT:    lwzu 8, 4(6)
+; CHECK-NEXT:    lwzu 8, 4(7)
 ; CHECK-NEXT:    bdz .LBB0_5
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    extswsli 7, 5, 5
+; CHECK-NEXT:    extswsli 6, 5, 5
 ; CHECK-NEXT:    add 5, 8, 5
-; CHECK-NEXT:    lwzu 8, 4(6)
+; CHECK-NEXT:    lwzu 8, 4(7)
 ; CHECK-NEXT:    bdz .LBB0_4
 ; CHECK-NEXT:  # %bb.2:
-; CHECK-NEXT:    add 7, 3, 7
-; CHECK-NEXT:    stdu 7, 8(4)
-; CHECK-NEXT:    extswsli 7, 5, 5
+; CHECK-NEXT:    add 6, 3, 6
+; CHECK-NEXT:    stdu 6, 8(4)
+; CHECK-NEXT:    extswsli 6, 5, 5
 ; CHECK-NEXT:    add 5, 8, 5
-; CHECK-NEXT:    lwzu 8, 4(6)
+; CHECK-NEXT:    lwzu 8, 4(7)
 ; CHECK-NEXT:    bdz .LBB0_4
 ; CHECK-NEXT:    .p2align 5
 ; CHECK-NEXT:  .LBB0_3:
-; CHECK-NEXT:    add 9, 3, 7
-; CHECK-NEXT:    extswsli 7, 5, 5
+; CHECK-NEXT:    add 9, 3, 6
+; CHECK-NEXT:    extswsli 6, 5, 5
 ; CHECK-NEXT:    add 5, 8, 5
-; CHECK-NEXT:    lwzu 8, 4(6)
+; CHECK-NEXT:    lwzu 8, 4(7)
 ; CHECK-NEXT:    stdu 9, 8(4)
 ; CHECK-NEXT:    bdnz .LBB0_3
 ; CHECK-NEXT:  .LBB0_4:
-; CHECK-NEXT:    add 6, 3, 7
+; CHECK-NEXT:    add 6, 3, 6
 ; CHECK-NEXT:    stdu 6, 8(4)
 ; CHECK-NEXT:  .LBB0_5:
 ; CHECK-NEXT:    extswsli 5, 5, 5

diff  --git a/llvm/test/CodeGen/PowerPC/stack-restore-with-setjmp.ll b/llvm/test/CodeGen/PowerPC/stack-restore-with-setjmp.ll
index eec5b4588f7c3..e225e63980c7f 100644
--- a/llvm/test/CodeGen/PowerPC/stack-restore-with-setjmp.ll
+++ b/llvm/test/CodeGen/PowerPC/stack-restore-with-setjmp.ll
@@ -16,12 +16,13 @@ define dso_local signext i32 @main(i32 signext %argc, ptr nocapture readnone %ar
 ; CHECK-NEXT:    stw 12, 8(1)
 ; CHECK-NEXT:    mflr 0
 ; CHECK-NEXT:    stdu 1, -784(1)
-; CHECK-NEXT:    mr 4, 3
+; CHECK-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-NEXT:    cmpwi 2, 3, 2
+; CHECK-NEXT:    li 4, 0
+; CHECK-NEXT:    # kill: def $r4 killed $r4 killed $x4
+; CHECK-NEXT:    mr 3, 4
 ; CHECK-NEXT:    std 0, 800(1)
 ; CHECK-NEXT:    mr 31, 1
-; CHECK-NEXT:    li 3, 0
-; CHECK-NEXT:    # kill: def $r3 killed $r3 killed $x3
-; CHECK-NEXT:    cmpwi 2, 4, 2
 ; CHECK-NEXT:    blt 2, .LBB0_3
 ; CHECK-NEXT:  # %bb.1: # %if.end
 ; CHECK-NEXT:    addi 3, 31, 112
@@ -65,6 +66,7 @@ define dso_local signext i32 @main(i32 signext %argc, ptr nocapture readnone %ar
 ; BE-NEXT:    stdu 1, -800(1)
 ; BE-NEXT:    li 4, 0
 ; BE-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; BE-NEXT:    # kill: def $r4 killed $r4 killed $x4
 ; BE-NEXT:    cmpwi 2, 3, 2
 ; BE-NEXT:    mr 3, 4
 ; BE-NEXT:    std 0, 816(1)

diff  --git a/llvm/test/CodeGen/PowerPC/subreg-postra-2.ll b/llvm/test/CodeGen/PowerPC/subreg-postra-2.ll
index 10fa8221778f5..f696745c9d414 100644
--- a/llvm/test/CodeGen/PowerPC/subreg-postra-2.ll
+++ b/llvm/test/CodeGen/PowerPC/subreg-postra-2.ll
@@ -60,27 +60,27 @@ define void @jbd2_journal_commit_transaction(i32 %input1, ptr %input2, ptr %inpu
 ; CHECK-NO-ISEL-NEXT:    bne- 0, .Ltmp0
 ; CHECK-NO-ISEL-EMPTY:
 ; CHECK-NO-ISEL-NEXT:    #NO_APP
-; CHECK-NO-ISEL-NEXT:    std 4, 0(6)
+; CHECK-NO-ISEL-NEXT:    std 5, 0(6)
 ; CHECK-NO-ISEL-NEXT:    beq- 5, .LBB0_6
 ; CHECK-NO-ISEL-NEXT:  .LBB0_2: # %while.body392
 ; CHECK-NO-ISEL-NEXT:    #
 ; CHECK-NO-ISEL-NEXT:    bne- 1, .LBB0_5
 ; CHECK-NO-ISEL-NEXT:  # %bb.3: # %wait_on_buffer.exit1319
 ; CHECK-NO-ISEL-NEXT:    #
-; CHECK-NO-ISEL-NEXT:    ld 4, 0(6)
-; CHECK-NO-ISEL-NEXT:    mr 9, 4
-; CHECK-NO-ISEL-NEXT:    ldu 5, -72(9)
-; CHECK-NO-ISEL-NEXT:    andi. 5, 5, 1
-; CHECK-NO-ISEL-NEXT:    mr 5, 3
+; CHECK-NO-ISEL-NEXT:    ld 5, 0(6)
+; CHECK-NO-ISEL-NEXT:    mr 9, 5
+; CHECK-NO-ISEL-NEXT:    ldu 4, -72(9)
+; CHECK-NO-ISEL-NEXT:    andi. 4, 4, 1
+; CHECK-NO-ISEL-NEXT:    mr 4, 3
 ; CHECK-NO-ISEL-NEXT:    bc 12, 1, .LBB0_1
 ; CHECK-NO-ISEL-NEXT:  # %bb.4: # %wait_on_buffer.exit1319
 ; CHECK-NO-ISEL-NEXT:    #
-; CHECK-NO-ISEL-NEXT:    li 5, -5
+; CHECK-NO-ISEL-NEXT:    li 4, -5
 ; CHECK-NO-ISEL-NEXT:    b .LBB0_1
 ; CHECK-NO-ISEL-NEXT:  .LBB0_5:
-; CHECK-NO-ISEL-NEXT:    mr 5, 7
+; CHECK-NO-ISEL-NEXT:    mr 4, 7
 ; CHECK-NO-ISEL-NEXT:  .LBB0_6: # %while.end418
-; CHECK-NO-ISEL-NEXT:    cmplwi 5, 0
+; CHECK-NO-ISEL-NEXT:    cmplwi 4, 0
 ; CHECK-NO-ISEL-NEXT:    beq 0, .LBB0_8
 ; CHECK-NO-ISEL-NEXT:  # %bb.7: # %if.then420
 ; CHECK-NO-ISEL-NEXT:  .LBB0_8: # %if.end421

diff  --git a/llvm/test/CodeGen/PowerPC/vsx.ll b/llvm/test/CodeGen/PowerPC/vsx.ll
index 9e0dc87f0ab8b..14b3d69f8c273 100644
--- a/llvm/test/CodeGen/PowerPC/vsx.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx.ll
@@ -2488,8 +2488,8 @@ define double @test82(double %a, double %b, double %c, double %d) {
 ; CHECK-FISL:       # %bb.0: # %entry
 ; CHECK-FISL-NEXT:    stfd f2, -16(r1) # 8-byte Folded Spill
 ; CHECK-FISL-NEXT:    fmr f2, f1
-; CHECK-FISL-NEXT:    stfd f2, -8(r1) # 8-byte Folded Spill
 ; CHECK-FISL-NEXT:    xscmpudp cr0, f3, f4
+; CHECK-FISL-NEXT:    stfd f2, -8(r1) # 8-byte Folded Spill
 ; CHECK-FISL-NEXT:    beq cr0, .LBB67_2
 ; CHECK-FISL-NEXT:  # %bb.1: # %entry
 ; CHECK-FISL-NEXT:    lfd f0, -16(r1) # 8-byte Folded Reload

diff  --git a/llvm/test/CodeGen/RISCV/abds.ll b/llvm/test/CodeGen/RISCV/abds.ll
index 2133ffeeb19dc..efb4e1a6f15d6 100644
--- a/llvm/test/CodeGen/RISCV/abds.ll
+++ b/llvm/test/CodeGen/RISCV/abds.ll
@@ -543,17 +543,17 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw t1, 12(a2)
 ; RV32I-NEXT:    lw a2, 4(a2)
 ; RV32I-NEXT:    sltu t0, a6, a5
-; RV32I-NEXT:    mv t3, t0
+; RV32I-NEXT:    mv t4, t0
 ; RV32I-NEXT:    beq a7, t1, .LBB11_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slt t3, t1, a7
+; RV32I-NEXT:    slt t4, t1, a7
 ; RV32I-NEXT:  .LBB11_2:
 ; RV32I-NEXT:    sltu t2, a1, a3
 ; RV32I-NEXT:    sltu t5, a2, a4
-; RV32I-NEXT:    mv t4, t2
+; RV32I-NEXT:    mv t3, t2
 ; RV32I-NEXT:    beq a4, a2, .LBB11_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    mv t4, t5
+; RV32I-NEXT:    mv t3, t5
 ; RV32I-NEXT:  .LBB11_4:
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
@@ -562,12 +562,12 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    or t6, s0, t6
 ; RV32I-NEXT:    beqz t6, .LBB11_6
 ; RV32I-NEXT:  # %bb.5:
-; RV32I-NEXT:    mv t4, t3
+; RV32I-NEXT:    mv t3, t4
 ; RV32I-NEXT:  .LBB11_6:
-; RV32I-NEXT:    mv t3, t2
+; RV32I-NEXT:    mv t4, t2
 ; RV32I-NEXT:    beq a2, a4, .LBB11_8
 ; RV32I-NEXT:  # %bb.7:
-; RV32I-NEXT:    mv t3, t5
+; RV32I-NEXT:    mv t4, t5
 ; RV32I-NEXT:  .LBB11_8:
 ; RV32I-NEXT:    sltu t5, a3, a1
 ; RV32I-NEXT:    mv t6, t5
@@ -575,17 +575,17 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  # %bb.9:
 ; RV32I-NEXT:    sltu t6, a4, a2
 ; RV32I-NEXT:  .LBB11_10:
-; RV32I-NEXT:    bnez t4, .LBB11_12
+; RV32I-NEXT:    bnez t3, .LBB11_12
 ; RV32I-NEXT:  # %bb.11:
 ; RV32I-NEXT:    sub a7, t1, a7
 ; RV32I-NEXT:    sub a5, a6, a5
 ; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    sub a2, a2, a4
 ; RV32I-NEXT:    sub a4, a7, t0
-; RV32I-NEXT:    sltu a6, a5, t3
+; RV32I-NEXT:    sltu a6, a5, t4
 ; RV32I-NEXT:    sub a3, a2, t2
 ; RV32I-NEXT:    sub a2, a4, a6
-; RV32I-NEXT:    sub a4, a5, t3
+; RV32I-NEXT:    sub a4, a5, t4
 ; RV32I-NEXT:    j .LBB11_13
 ; RV32I-NEXT:  .LBB11_12:
 ; RV32I-NEXT:    sltu t0, a5, a6
@@ -639,17 +639,17 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    lw t1, 12(a2)
 ; RV32ZBB-NEXT:    lw a2, 4(a2)
 ; RV32ZBB-NEXT:    sltu t0, a6, a5
-; RV32ZBB-NEXT:    mv t3, t0
+; RV32ZBB-NEXT:    mv t4, t0
 ; RV32ZBB-NEXT:    beq a7, t1, .LBB11_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    slt t3, t1, a7
+; RV32ZBB-NEXT:    slt t4, t1, a7
 ; RV32ZBB-NEXT:  .LBB11_2:
 ; RV32ZBB-NEXT:    sltu t2, a1, a3
 ; RV32ZBB-NEXT:    sltu t5, a2, a4
-; RV32ZBB-NEXT:    mv t4, t2
+; RV32ZBB-NEXT:    mv t3, t2
 ; RV32ZBB-NEXT:    beq a4, a2, .LBB11_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    mv t4, t5
+; RV32ZBB-NEXT:    mv t3, t5
 ; RV32ZBB-NEXT:  .LBB11_4:
 ; RV32ZBB-NEXT:    addi sp, sp, -16
 ; RV32ZBB-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
@@ -658,12 +658,12 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    or t6, s0, t6
 ; RV32ZBB-NEXT:    beqz t6, .LBB11_6
 ; RV32ZBB-NEXT:  # %bb.5:
-; RV32ZBB-NEXT:    mv t4, t3
+; RV32ZBB-NEXT:    mv t3, t4
 ; RV32ZBB-NEXT:  .LBB11_6:
-; RV32ZBB-NEXT:    mv t3, t2
+; RV32ZBB-NEXT:    mv t4, t2
 ; RV32ZBB-NEXT:    beq a2, a4, .LBB11_8
 ; RV32ZBB-NEXT:  # %bb.7:
-; RV32ZBB-NEXT:    mv t3, t5
+; RV32ZBB-NEXT:    mv t4, t5
 ; RV32ZBB-NEXT:  .LBB11_8:
 ; RV32ZBB-NEXT:    sltu t5, a3, a1
 ; RV32ZBB-NEXT:    mv t6, t5
@@ -671,17 +671,17 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  # %bb.9:
 ; RV32ZBB-NEXT:    sltu t6, a4, a2
 ; RV32ZBB-NEXT:  .LBB11_10:
-; RV32ZBB-NEXT:    bnez t4, .LBB11_12
+; RV32ZBB-NEXT:    bnez t3, .LBB11_12
 ; RV32ZBB-NEXT:  # %bb.11:
 ; RV32ZBB-NEXT:    sub a7, t1, a7
 ; RV32ZBB-NEXT:    sub a5, a6, a5
 ; RV32ZBB-NEXT:    sub a1, a1, a3
 ; RV32ZBB-NEXT:    sub a2, a2, a4
 ; RV32ZBB-NEXT:    sub a4, a7, t0
-; RV32ZBB-NEXT:    sltu a6, a5, t3
+; RV32ZBB-NEXT:    sltu a6, a5, t4
 ; RV32ZBB-NEXT:    sub a3, a2, t2
 ; RV32ZBB-NEXT:    sub a2, a4, a6
-; RV32ZBB-NEXT:    sub a4, a5, t3
+; RV32ZBB-NEXT:    sub a4, a5, t4
 ; RV32ZBB-NEXT:    j .LBB11_13
 ; RV32ZBB-NEXT:  .LBB11_12:
 ; RV32ZBB-NEXT:    sltu t0, a5, a6
@@ -743,17 +743,17 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw t1, 12(a2)
 ; RV32I-NEXT:    lw a2, 4(a2)
 ; RV32I-NEXT:    sltu t0, a6, a5
-; RV32I-NEXT:    mv t3, t0
+; RV32I-NEXT:    mv t4, t0
 ; RV32I-NEXT:    beq a7, t1, .LBB12_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slt t3, t1, a7
+; RV32I-NEXT:    slt t4, t1, a7
 ; RV32I-NEXT:  .LBB12_2:
 ; RV32I-NEXT:    sltu t2, a1, a3
 ; RV32I-NEXT:    sltu t5, a2, a4
-; RV32I-NEXT:    mv t4, t2
+; RV32I-NEXT:    mv t3, t2
 ; RV32I-NEXT:    beq a4, a2, .LBB12_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    mv t4, t5
+; RV32I-NEXT:    mv t3, t5
 ; RV32I-NEXT:  .LBB12_4:
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
@@ -762,12 +762,12 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    or t6, s0, t6
 ; RV32I-NEXT:    beqz t6, .LBB12_6
 ; RV32I-NEXT:  # %bb.5:
-; RV32I-NEXT:    mv t4, t3
+; RV32I-NEXT:    mv t3, t4
 ; RV32I-NEXT:  .LBB12_6:
-; RV32I-NEXT:    mv t3, t2
+; RV32I-NEXT:    mv t4, t2
 ; RV32I-NEXT:    beq a2, a4, .LBB12_8
 ; RV32I-NEXT:  # %bb.7:
-; RV32I-NEXT:    mv t3, t5
+; RV32I-NEXT:    mv t4, t5
 ; RV32I-NEXT:  .LBB12_8:
 ; RV32I-NEXT:    sltu t5, a3, a1
 ; RV32I-NEXT:    mv t6, t5
@@ -775,17 +775,17 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  # %bb.9:
 ; RV32I-NEXT:    sltu t6, a4, a2
 ; RV32I-NEXT:  .LBB12_10:
-; RV32I-NEXT:    bnez t4, .LBB12_12
+; RV32I-NEXT:    bnez t3, .LBB12_12
 ; RV32I-NEXT:  # %bb.11:
 ; RV32I-NEXT:    sub a7, t1, a7
 ; RV32I-NEXT:    sub a5, a6, a5
 ; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    sub a2, a2, a4
 ; RV32I-NEXT:    sub a4, a7, t0
-; RV32I-NEXT:    sltu a6, a5, t3
+; RV32I-NEXT:    sltu a6, a5, t4
 ; RV32I-NEXT:    sub a3, a2, t2
 ; RV32I-NEXT:    sub a2, a4, a6
-; RV32I-NEXT:    sub a4, a5, t3
+; RV32I-NEXT:    sub a4, a5, t4
 ; RV32I-NEXT:    j .LBB12_13
 ; RV32I-NEXT:  .LBB12_12:
 ; RV32I-NEXT:    sltu t0, a5, a6
@@ -839,17 +839,17 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    lw t1, 12(a2)
 ; RV32ZBB-NEXT:    lw a2, 4(a2)
 ; RV32ZBB-NEXT:    sltu t0, a6, a5
-; RV32ZBB-NEXT:    mv t3, t0
+; RV32ZBB-NEXT:    mv t4, t0
 ; RV32ZBB-NEXT:    beq a7, t1, .LBB12_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    slt t3, t1, a7
+; RV32ZBB-NEXT:    slt t4, t1, a7
 ; RV32ZBB-NEXT:  .LBB12_2:
 ; RV32ZBB-NEXT:    sltu t2, a1, a3
 ; RV32ZBB-NEXT:    sltu t5, a2, a4
-; RV32ZBB-NEXT:    mv t4, t2
+; RV32ZBB-NEXT:    mv t3, t2
 ; RV32ZBB-NEXT:    beq a4, a2, .LBB12_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    mv t4, t5
+; RV32ZBB-NEXT:    mv t3, t5
 ; RV32ZBB-NEXT:  .LBB12_4:
 ; RV32ZBB-NEXT:    addi sp, sp, -16
 ; RV32ZBB-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
@@ -858,12 +858,12 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    or t6, s0, t6
 ; RV32ZBB-NEXT:    beqz t6, .LBB12_6
 ; RV32ZBB-NEXT:  # %bb.5:
-; RV32ZBB-NEXT:    mv t4, t3
+; RV32ZBB-NEXT:    mv t3, t4
 ; RV32ZBB-NEXT:  .LBB12_6:
-; RV32ZBB-NEXT:    mv t3, t2
+; RV32ZBB-NEXT:    mv t4, t2
 ; RV32ZBB-NEXT:    beq a2, a4, .LBB12_8
 ; RV32ZBB-NEXT:  # %bb.7:
-; RV32ZBB-NEXT:    mv t3, t5
+; RV32ZBB-NEXT:    mv t4, t5
 ; RV32ZBB-NEXT:  .LBB12_8:
 ; RV32ZBB-NEXT:    sltu t5, a3, a1
 ; RV32ZBB-NEXT:    mv t6, t5
@@ -871,17 +871,17 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  # %bb.9:
 ; RV32ZBB-NEXT:    sltu t6, a4, a2
 ; RV32ZBB-NEXT:  .LBB12_10:
-; RV32ZBB-NEXT:    bnez t4, .LBB12_12
+; RV32ZBB-NEXT:    bnez t3, .LBB12_12
 ; RV32ZBB-NEXT:  # %bb.11:
 ; RV32ZBB-NEXT:    sub a7, t1, a7
 ; RV32ZBB-NEXT:    sub a5, a6, a5
 ; RV32ZBB-NEXT:    sub a1, a1, a3
 ; RV32ZBB-NEXT:    sub a2, a2, a4
 ; RV32ZBB-NEXT:    sub a4, a7, t0
-; RV32ZBB-NEXT:    sltu a6, a5, t3
+; RV32ZBB-NEXT:    sltu a6, a5, t4
 ; RV32ZBB-NEXT:    sub a3, a2, t2
 ; RV32ZBB-NEXT:    sub a2, a4, a6
-; RV32ZBB-NEXT:    sub a4, a5, t3
+; RV32ZBB-NEXT:    sub a4, a5, t4
 ; RV32ZBB-NEXT:    j .LBB12_13
 ; RV32ZBB-NEXT:  .LBB12_12:
 ; RV32ZBB-NEXT:    sltu t0, a5, a6
@@ -1132,17 +1132,17 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw t1, 12(a2)
 ; RV32I-NEXT:    lw a2, 4(a2)
 ; RV32I-NEXT:    sltu t0, a6, a5
-; RV32I-NEXT:    mv t3, t0
+; RV32I-NEXT:    mv t4, t0
 ; RV32I-NEXT:    beq a7, t1, .LBB17_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slt t3, t1, a7
+; RV32I-NEXT:    slt t4, t1, a7
 ; RV32I-NEXT:  .LBB17_2:
 ; RV32I-NEXT:    sltu t2, a1, a3
 ; RV32I-NEXT:    sltu t5, a2, a4
-; RV32I-NEXT:    mv t4, t2
+; RV32I-NEXT:    mv t3, t2
 ; RV32I-NEXT:    beq a4, a2, .LBB17_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    mv t4, t5
+; RV32I-NEXT:    mv t3, t5
 ; RV32I-NEXT:  .LBB17_4:
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
@@ -1151,12 +1151,12 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    or t6, s0, t6
 ; RV32I-NEXT:    beqz t6, .LBB17_6
 ; RV32I-NEXT:  # %bb.5:
-; RV32I-NEXT:    mv t4, t3
+; RV32I-NEXT:    mv t3, t4
 ; RV32I-NEXT:  .LBB17_6:
-; RV32I-NEXT:    mv t3, t2
+; RV32I-NEXT:    mv t4, t2
 ; RV32I-NEXT:    beq a2, a4, .LBB17_8
 ; RV32I-NEXT:  # %bb.7:
-; RV32I-NEXT:    mv t3, t5
+; RV32I-NEXT:    mv t4, t5
 ; RV32I-NEXT:  .LBB17_8:
 ; RV32I-NEXT:    sltu t5, a3, a1
 ; RV32I-NEXT:    mv t6, t5
@@ -1164,17 +1164,17 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  # %bb.9:
 ; RV32I-NEXT:    sltu t6, a4, a2
 ; RV32I-NEXT:  .LBB17_10:
-; RV32I-NEXT:    bnez t4, .LBB17_12
+; RV32I-NEXT:    bnez t3, .LBB17_12
 ; RV32I-NEXT:  # %bb.11:
 ; RV32I-NEXT:    sub a7, t1, a7
 ; RV32I-NEXT:    sub a5, a6, a5
 ; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    sub a2, a2, a4
 ; RV32I-NEXT:    sub a4, a7, t0
-; RV32I-NEXT:    sltu a6, a5, t3
+; RV32I-NEXT:    sltu a6, a5, t4
 ; RV32I-NEXT:    sub a3, a2, t2
 ; RV32I-NEXT:    sub a2, a4, a6
-; RV32I-NEXT:    sub a4, a5, t3
+; RV32I-NEXT:    sub a4, a5, t4
 ; RV32I-NEXT:    j .LBB17_13
 ; RV32I-NEXT:  .LBB17_12:
 ; RV32I-NEXT:    sltu t0, a5, a6
@@ -1228,17 +1228,17 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    lw t1, 12(a2)
 ; RV32ZBB-NEXT:    lw a2, 4(a2)
 ; RV32ZBB-NEXT:    sltu t0, a6, a5
-; RV32ZBB-NEXT:    mv t3, t0
+; RV32ZBB-NEXT:    mv t4, t0
 ; RV32ZBB-NEXT:    beq a7, t1, .LBB17_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    slt t3, t1, a7
+; RV32ZBB-NEXT:    slt t4, t1, a7
 ; RV32ZBB-NEXT:  .LBB17_2:
 ; RV32ZBB-NEXT:    sltu t2, a1, a3
 ; RV32ZBB-NEXT:    sltu t5, a2, a4
-; RV32ZBB-NEXT:    mv t4, t2
+; RV32ZBB-NEXT:    mv t3, t2
 ; RV32ZBB-NEXT:    beq a4, a2, .LBB17_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    mv t4, t5
+; RV32ZBB-NEXT:    mv t3, t5
 ; RV32ZBB-NEXT:  .LBB17_4:
 ; RV32ZBB-NEXT:    addi sp, sp, -16
 ; RV32ZBB-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
@@ -1247,12 +1247,12 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    or t6, s0, t6
 ; RV32ZBB-NEXT:    beqz t6, .LBB17_6
 ; RV32ZBB-NEXT:  # %bb.5:
-; RV32ZBB-NEXT:    mv t4, t3
+; RV32ZBB-NEXT:    mv t3, t4
 ; RV32ZBB-NEXT:  .LBB17_6:
-; RV32ZBB-NEXT:    mv t3, t2
+; RV32ZBB-NEXT:    mv t4, t2
 ; RV32ZBB-NEXT:    beq a2, a4, .LBB17_8
 ; RV32ZBB-NEXT:  # %bb.7:
-; RV32ZBB-NEXT:    mv t3, t5
+; RV32ZBB-NEXT:    mv t4, t5
 ; RV32ZBB-NEXT:  .LBB17_8:
 ; RV32ZBB-NEXT:    sltu t5, a3, a1
 ; RV32ZBB-NEXT:    mv t6, t5
@@ -1260,17 +1260,17 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  # %bb.9:
 ; RV32ZBB-NEXT:    sltu t6, a4, a2
 ; RV32ZBB-NEXT:  .LBB17_10:
-; RV32ZBB-NEXT:    bnez t4, .LBB17_12
+; RV32ZBB-NEXT:    bnez t3, .LBB17_12
 ; RV32ZBB-NEXT:  # %bb.11:
 ; RV32ZBB-NEXT:    sub a7, t1, a7
 ; RV32ZBB-NEXT:    sub a5, a6, a5
 ; RV32ZBB-NEXT:    sub a1, a1, a3
 ; RV32ZBB-NEXT:    sub a2, a2, a4
 ; RV32ZBB-NEXT:    sub a4, a7, t0
-; RV32ZBB-NEXT:    sltu a6, a5, t3
+; RV32ZBB-NEXT:    sltu a6, a5, t4
 ; RV32ZBB-NEXT:    sub a3, a2, t2
 ; RV32ZBB-NEXT:    sub a2, a4, a6
-; RV32ZBB-NEXT:    sub a4, a5, t3
+; RV32ZBB-NEXT:    sub a4, a5, t4
 ; RV32ZBB-NEXT:    j .LBB17_13
 ; RV32ZBB-NEXT:  .LBB17_12:
 ; RV32ZBB-NEXT:    sltu t0, a5, a6
@@ -1523,17 +1523,17 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw t1, 12(a2)
 ; RV32I-NEXT:    lw a2, 4(a2)
 ; RV32I-NEXT:    sltu t0, a6, a5
-; RV32I-NEXT:    mv t3, t0
+; RV32I-NEXT:    mv t4, t0
 ; RV32I-NEXT:    beq a7, t1, .LBB22_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slt t3, t1, a7
+; RV32I-NEXT:    slt t4, t1, a7
 ; RV32I-NEXT:  .LBB22_2:
 ; RV32I-NEXT:    sltu t2, a1, a3
 ; RV32I-NEXT:    sltu t5, a2, a4
-; RV32I-NEXT:    mv t4, t2
+; RV32I-NEXT:    mv t3, t2
 ; RV32I-NEXT:    beq a4, a2, .LBB22_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    mv t4, t5
+; RV32I-NEXT:    mv t3, t5
 ; RV32I-NEXT:  .LBB22_4:
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
@@ -1542,12 +1542,12 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    or t6, s0, t6
 ; RV32I-NEXT:    beqz t6, .LBB22_6
 ; RV32I-NEXT:  # %bb.5:
-; RV32I-NEXT:    mv t4, t3
+; RV32I-NEXT:    mv t3, t4
 ; RV32I-NEXT:  .LBB22_6:
-; RV32I-NEXT:    mv t3, t2
+; RV32I-NEXT:    mv t4, t2
 ; RV32I-NEXT:    beq a2, a4, .LBB22_8
 ; RV32I-NEXT:  # %bb.7:
-; RV32I-NEXT:    mv t3, t5
+; RV32I-NEXT:    mv t4, t5
 ; RV32I-NEXT:  .LBB22_8:
 ; RV32I-NEXT:    sltu t5, a3, a1
 ; RV32I-NEXT:    mv t6, t5
@@ -1555,17 +1555,17 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  # %bb.9:
 ; RV32I-NEXT:    sltu t6, a4, a2
 ; RV32I-NEXT:  .LBB22_10:
-; RV32I-NEXT:    bnez t4, .LBB22_12
+; RV32I-NEXT:    bnez t3, .LBB22_12
 ; RV32I-NEXT:  # %bb.11:
 ; RV32I-NEXT:    sub a7, t1, a7
 ; RV32I-NEXT:    sub a5, a6, a5
 ; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    sub a2, a2, a4
 ; RV32I-NEXT:    sub a4, a7, t0
-; RV32I-NEXT:    sltu a6, a5, t3
+; RV32I-NEXT:    sltu a6, a5, t4
 ; RV32I-NEXT:    sub a3, a2, t2
 ; RV32I-NEXT:    sub a2, a4, a6
-; RV32I-NEXT:    sub a4, a5, t3
+; RV32I-NEXT:    sub a4, a5, t4
 ; RV32I-NEXT:    j .LBB22_13
 ; RV32I-NEXT:  .LBB22_12:
 ; RV32I-NEXT:    sltu t0, a5, a6
@@ -1619,17 +1619,17 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    lw t1, 12(a2)
 ; RV32ZBB-NEXT:    lw a2, 4(a2)
 ; RV32ZBB-NEXT:    sltu t0, a6, a5
-; RV32ZBB-NEXT:    mv t3, t0
+; RV32ZBB-NEXT:    mv t4, t0
 ; RV32ZBB-NEXT:    beq a7, t1, .LBB22_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    slt t3, t1, a7
+; RV32ZBB-NEXT:    slt t4, t1, a7
 ; RV32ZBB-NEXT:  .LBB22_2:
 ; RV32ZBB-NEXT:    sltu t2, a1, a3
 ; RV32ZBB-NEXT:    sltu t5, a2, a4
-; RV32ZBB-NEXT:    mv t4, t2
+; RV32ZBB-NEXT:    mv t3, t2
 ; RV32ZBB-NEXT:    beq a4, a2, .LBB22_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    mv t4, t5
+; RV32ZBB-NEXT:    mv t3, t5
 ; RV32ZBB-NEXT:  .LBB22_4:
 ; RV32ZBB-NEXT:    addi sp, sp, -16
 ; RV32ZBB-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
@@ -1638,12 +1638,12 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    or t6, s0, t6
 ; RV32ZBB-NEXT:    beqz t6, .LBB22_6
 ; RV32ZBB-NEXT:  # %bb.5:
-; RV32ZBB-NEXT:    mv t4, t3
+; RV32ZBB-NEXT:    mv t3, t4
 ; RV32ZBB-NEXT:  .LBB22_6:
-; RV32ZBB-NEXT:    mv t3, t2
+; RV32ZBB-NEXT:    mv t4, t2
 ; RV32ZBB-NEXT:    beq a2, a4, .LBB22_8
 ; RV32ZBB-NEXT:  # %bb.7:
-; RV32ZBB-NEXT:    mv t3, t5
+; RV32ZBB-NEXT:    mv t4, t5
 ; RV32ZBB-NEXT:  .LBB22_8:
 ; RV32ZBB-NEXT:    sltu t5, a3, a1
 ; RV32ZBB-NEXT:    mv t6, t5
@@ -1651,17 +1651,17 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  # %bb.9:
 ; RV32ZBB-NEXT:    sltu t6, a4, a2
 ; RV32ZBB-NEXT:  .LBB22_10:
-; RV32ZBB-NEXT:    bnez t4, .LBB22_12
+; RV32ZBB-NEXT:    bnez t3, .LBB22_12
 ; RV32ZBB-NEXT:  # %bb.11:
 ; RV32ZBB-NEXT:    sub a7, t1, a7
 ; RV32ZBB-NEXT:    sub a5, a6, a5
 ; RV32ZBB-NEXT:    sub a1, a1, a3
 ; RV32ZBB-NEXT:    sub a2, a2, a4
 ; RV32ZBB-NEXT:    sub a4, a7, t0
-; RV32ZBB-NEXT:    sltu a6, a5, t3
+; RV32ZBB-NEXT:    sltu a6, a5, t4
 ; RV32ZBB-NEXT:    sub a3, a2, t2
 ; RV32ZBB-NEXT:    sub a2, a4, a6
-; RV32ZBB-NEXT:    sub a4, a5, t3
+; RV32ZBB-NEXT:    sub a4, a5, t4
 ; RV32ZBB-NEXT:    j .LBB22_13
 ; RV32ZBB-NEXT:  .LBB22_12:
 ; RV32ZBB-NEXT:    sltu t0, a5, a6
@@ -2546,17 +2546,17 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw t1, 12(a2)
 ; RV32I-NEXT:    lw a2, 4(a2)
 ; RV32I-NEXT:    sltu t0, a6, a5
-; RV32I-NEXT:    mv t3, t0
+; RV32I-NEXT:    mv t4, t0
 ; RV32I-NEXT:    beq a7, t1, .LBB38_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slt t3, t1, a7
+; RV32I-NEXT:    slt t4, t1, a7
 ; RV32I-NEXT:  .LBB38_2:
 ; RV32I-NEXT:    sltu t2, a1, a3
 ; RV32I-NEXT:    sltu t5, a2, a4
-; RV32I-NEXT:    mv t4, t2
+; RV32I-NEXT:    mv t3, t2
 ; RV32I-NEXT:    beq a4, a2, .LBB38_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    mv t4, t5
+; RV32I-NEXT:    mv t3, t5
 ; RV32I-NEXT:  .LBB38_4:
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
@@ -2565,12 +2565,12 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    or t6, s0, t6
 ; RV32I-NEXT:    beqz t6, .LBB38_6
 ; RV32I-NEXT:  # %bb.5:
-; RV32I-NEXT:    mv t4, t3
+; RV32I-NEXT:    mv t3, t4
 ; RV32I-NEXT:  .LBB38_6:
-; RV32I-NEXT:    mv t3, t2
+; RV32I-NEXT:    mv t4, t2
 ; RV32I-NEXT:    beq a2, a4, .LBB38_8
 ; RV32I-NEXT:  # %bb.7:
-; RV32I-NEXT:    mv t3, t5
+; RV32I-NEXT:    mv t4, t5
 ; RV32I-NEXT:  .LBB38_8:
 ; RV32I-NEXT:    sltu t5, a3, a1
 ; RV32I-NEXT:    mv t6, t5
@@ -2578,17 +2578,17 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  # %bb.9:
 ; RV32I-NEXT:    sltu t6, a4, a2
 ; RV32I-NEXT:  .LBB38_10:
-; RV32I-NEXT:    bnez t4, .LBB38_12
+; RV32I-NEXT:    bnez t3, .LBB38_12
 ; RV32I-NEXT:  # %bb.11:
 ; RV32I-NEXT:    sub a7, t1, a7
 ; RV32I-NEXT:    sub a5, a6, a5
 ; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    sub a2, a2, a4
 ; RV32I-NEXT:    sub a4, a7, t0
-; RV32I-NEXT:    sltu a6, a5, t3
+; RV32I-NEXT:    sltu a6, a5, t4
 ; RV32I-NEXT:    sub a3, a2, t2
 ; RV32I-NEXT:    sub a2, a4, a6
-; RV32I-NEXT:    sub a4, a5, t3
+; RV32I-NEXT:    sub a4, a5, t4
 ; RV32I-NEXT:    j .LBB38_13
 ; RV32I-NEXT:  .LBB38_12:
 ; RV32I-NEXT:    sltu t0, a5, a6
@@ -2642,17 +2642,17 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    lw t1, 12(a2)
 ; RV32ZBB-NEXT:    lw a2, 4(a2)
 ; RV32ZBB-NEXT:    sltu t0, a6, a5
-; RV32ZBB-NEXT:    mv t3, t0
+; RV32ZBB-NEXT:    mv t4, t0
 ; RV32ZBB-NEXT:    beq a7, t1, .LBB38_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    slt t3, t1, a7
+; RV32ZBB-NEXT:    slt t4, t1, a7
 ; RV32ZBB-NEXT:  .LBB38_2:
 ; RV32ZBB-NEXT:    sltu t2, a1, a3
 ; RV32ZBB-NEXT:    sltu t5, a2, a4
-; RV32ZBB-NEXT:    mv t4, t2
+; RV32ZBB-NEXT:    mv t3, t2
 ; RV32ZBB-NEXT:    beq a4, a2, .LBB38_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    mv t4, t5
+; RV32ZBB-NEXT:    mv t3, t5
 ; RV32ZBB-NEXT:  .LBB38_4:
 ; RV32ZBB-NEXT:    addi sp, sp, -16
 ; RV32ZBB-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
@@ -2661,12 +2661,12 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    or t6, s0, t6
 ; RV32ZBB-NEXT:    beqz t6, .LBB38_6
 ; RV32ZBB-NEXT:  # %bb.5:
-; RV32ZBB-NEXT:    mv t4, t3
+; RV32ZBB-NEXT:    mv t3, t4
 ; RV32ZBB-NEXT:  .LBB38_6:
-; RV32ZBB-NEXT:    mv t3, t2
+; RV32ZBB-NEXT:    mv t4, t2
 ; RV32ZBB-NEXT:    beq a2, a4, .LBB38_8
 ; RV32ZBB-NEXT:  # %bb.7:
-; RV32ZBB-NEXT:    mv t3, t5
+; RV32ZBB-NEXT:    mv t4, t5
 ; RV32ZBB-NEXT:  .LBB38_8:
 ; RV32ZBB-NEXT:    sltu t5, a3, a1
 ; RV32ZBB-NEXT:    mv t6, t5
@@ -2674,17 +2674,17 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  # %bb.9:
 ; RV32ZBB-NEXT:    sltu t6, a4, a2
 ; RV32ZBB-NEXT:  .LBB38_10:
-; RV32ZBB-NEXT:    bnez t4, .LBB38_12
+; RV32ZBB-NEXT:    bnez t3, .LBB38_12
 ; RV32ZBB-NEXT:  # %bb.11:
 ; RV32ZBB-NEXT:    sub a7, t1, a7
 ; RV32ZBB-NEXT:    sub a5, a6, a5
 ; RV32ZBB-NEXT:    sub a1, a1, a3
 ; RV32ZBB-NEXT:    sub a2, a2, a4
 ; RV32ZBB-NEXT:    sub a4, a7, t0
-; RV32ZBB-NEXT:    sltu a6, a5, t3
+; RV32ZBB-NEXT:    sltu a6, a5, t4
 ; RV32ZBB-NEXT:    sub a3, a2, t2
 ; RV32ZBB-NEXT:    sub a2, a4, a6
-; RV32ZBB-NEXT:    sub a4, a5, t3
+; RV32ZBB-NEXT:    sub a4, a5, t4
 ; RV32ZBB-NEXT:    j .LBB38_13
 ; RV32ZBB-NEXT:  .LBB38_12:
 ; RV32ZBB-NEXT:    sltu t0, a5, a6

diff  --git a/llvm/test/CodeGen/RISCV/machine-pipeliner.ll b/llvm/test/CodeGen/RISCV/machine-pipeliner.ll
index bf81d03ec1352..d250098576687 100644
--- a/llvm/test/CodeGen/RISCV/machine-pipeliner.ll
+++ b/llvm/test/CodeGen/RISCV/machine-pipeliner.ll
@@ -52,18 +52,18 @@ define void @test_pipelined_1(ptr noalias %in, ptr noalias %out, i32 signext %cn
 ;
 ; CHECK-PIPELINED-LABEL: test_pipelined_1:
 ; CHECK-PIPELINED:       # %bb.0: # %entry
-; CHECK-PIPELINED-NEXT:    blez a2, .LBB1_7
+; CHECK-PIPELINED-NEXT:    blez a2, .LBB1_6
 ; CHECK-PIPELINED-NEXT:  # %bb.1: # %for.body.preheader
 ; CHECK-PIPELINED-NEXT:    lw a4, 0(a1)
 ; CHECK-PIPELINED-NEXT:    addi a2, a2, -1
-; CHECK-PIPELINED-NEXT:    addi a3, a0, 4
 ; CHECK-PIPELINED-NEXT:    sh2add.uw a6, a2, a1
+; CHECK-PIPELINED-NEXT:    addi a2, a0, 4
 ; CHECK-PIPELINED-NEXT:    addi a1, a1, 4
 ; CHECK-PIPELINED-NEXT:    addi a6, a6, 4
 ; CHECK-PIPELINED-NEXT:    beq a1, a6, .LBB1_5
 ; CHECK-PIPELINED-NEXT:  # %bb.2: # %for.body
 ; CHECK-PIPELINED-NEXT:    lw a5, 0(a1)
-; CHECK-PIPELINED-NEXT:    addi a2, a3, 4
+; CHECK-PIPELINED-NEXT:    addi a3, a2, 4
 ; CHECK-PIPELINED-NEXT:    addi a4, a4, 1
 ; CHECK-PIPELINED-NEXT:    addi a1, a1, 4
 ; CHECK-PIPELINED-NEXT:    beq a1, a6, .LBB1_4
@@ -72,22 +72,20 @@ define void @test_pipelined_1(ptr noalias %in, ptr noalias %out, i32 signext %cn
 ; CHECK-PIPELINED-NEXT:    sw a4, 0(a0)
 ; CHECK-PIPELINED-NEXT:    mv a4, a5
 ; CHECK-PIPELINED-NEXT:    lw a5, 0(a1)
-; CHECK-PIPELINED-NEXT:    mv a0, a3
-; CHECK-PIPELINED-NEXT:    mv a3, a2
-; CHECK-PIPELINED-NEXT:    addi a2, a2, 4
+; CHECK-PIPELINED-NEXT:    mv a0, a2
+; CHECK-PIPELINED-NEXT:    mv a2, a3
+; CHECK-PIPELINED-NEXT:    addi a3, a3, 4
 ; CHECK-PIPELINED-NEXT:    addi a4, a4, 1
 ; CHECK-PIPELINED-NEXT:    addi a1, a1, 4
 ; CHECK-PIPELINED-NEXT:    bne a1, a6, .LBB1_3
 ; CHECK-PIPELINED-NEXT:  .LBB1_4:
 ; CHECK-PIPELINED-NEXT:    sw a4, 0(a0)
-; CHECK-PIPELINED-NEXT:    j .LBB1_6
+; CHECK-PIPELINED-NEXT:    mv a0, a2
+; CHECK-PIPELINED-NEXT:    mv a4, a5
 ; CHECK-PIPELINED-NEXT:  .LBB1_5:
-; CHECK-PIPELINED-NEXT:    mv a3, a0
-; CHECK-PIPELINED-NEXT:    mv a5, a4
-; CHECK-PIPELINED-NEXT:  .LBB1_6:
-; CHECK-PIPELINED-NEXT:    addi a5, a5, 1
-; CHECK-PIPELINED-NEXT:    sw a5, 0(a3)
-; CHECK-PIPELINED-NEXT:  .LBB1_7: # %for.end
+; CHECK-PIPELINED-NEXT:    addi a4, a4, 1
+; CHECK-PIPELINED-NEXT:    sw a4, 0(a0)
+; CHECK-PIPELINED-NEXT:  .LBB1_6: # %for.end
 ; CHECK-PIPELINED-NEXT:    ret
 entry:
   %cmp = icmp sgt i32 %cnt, 0

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index 7fbe60bc9cb34..533b8b6864ebc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -6571,53 +6571,53 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    lw a4, 32(a2)
 ; RV32ZVE32F-NEXT:    lw a5, 40(a2)
-; RV32ZVE32F-NEXT:    lw a7, 48(a2)
-; RV32ZVE32F-NEXT:    lw t0, 56(a2)
-; RV32ZVE32F-NEXT:    lw a6, 0(a2)
+; RV32ZVE32F-NEXT:    lw a6, 48(a2)
+; RV32ZVE32F-NEXT:    lw a7, 56(a2)
+; RV32ZVE32F-NEXT:    lw t0, 0(a2)
 ; RV32ZVE32F-NEXT:    lw t1, 8(a2)
 ; RV32ZVE32F-NEXT:    lw t2, 16(a2)
 ; RV32ZVE32F-NEXT:    lw a2, 24(a2)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32ZVE32F-NEXT:    vmv.v.x v8, a6
+; RV32ZVE32F-NEXT:    vmv.v.x v8, t0
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVE32F-NEXT:    vmv.x.s a6, v0
+; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t1
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t2
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a4
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t0
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
-; RV32ZVE32F-NEXT:    andi a2, a6, 1
+; RV32ZVE32F-NEXT:    andi a2, t0, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    beqz a2, .LBB57_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
 ; RV32ZVE32F-NEXT:    lw a1, 0(a2)
 ; RV32ZVE32F-NEXT:    lw a2, 4(a2)
-; RV32ZVE32F-NEXT:    andi a4, a6, 2
+; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    bnez a4, .LBB57_8
 ; RV32ZVE32F-NEXT:  .LBB57_2:
 ; RV32ZVE32F-NEXT:    lw a4, 8(a3)
 ; RV32ZVE32F-NEXT:    lw a5, 12(a3)
-; RV32ZVE32F-NEXT:    andi a7, a6, 4
-; RV32ZVE32F-NEXT:    bnez a7, .LBB57_9
+; RV32ZVE32F-NEXT:    andi a6, t0, 4
+; RV32ZVE32F-NEXT:    bnez a6, .LBB57_9
 ; RV32ZVE32F-NEXT:  .LBB57_3:
-; RV32ZVE32F-NEXT:    lw a7, 16(a3)
-; RV32ZVE32F-NEXT:    lw t0, 20(a3)
-; RV32ZVE32F-NEXT:    andi t1, a6, 8
+; RV32ZVE32F-NEXT:    lw a6, 16(a3)
+; RV32ZVE32F-NEXT:    lw a7, 20(a3)
+; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    bnez t1, .LBB57_10
 ; RV32ZVE32F-NEXT:  .LBB57_4:
 ; RV32ZVE32F-NEXT:    lw t1, 24(a3)
 ; RV32ZVE32F-NEXT:    lw t2, 28(a3)
-; RV32ZVE32F-NEXT:    andi t3, a6, 16
+; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    bnez t3, .LBB57_11
 ; RV32ZVE32F-NEXT:  .LBB57_5:
 ; RV32ZVE32F-NEXT:    lw t3, 32(a3)
 ; RV32ZVE32F-NEXT:    lw t4, 36(a3)
-; RV32ZVE32F-NEXT:    andi t5, a6, 32
+; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    bnez t5, .LBB57_12
 ; RV32ZVE32F-NEXT:  .LBB57_6:
 ; RV32ZVE32F-NEXT:    lw t5, 40(a3)
@@ -6626,7 +6626,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
 ; RV32ZVE32F-NEXT:  .LBB57_7:
 ; RV32ZVE32F-NEXT:    lw a1, 0(a3)
 ; RV32ZVE32F-NEXT:    lw a2, 4(a3)
-; RV32ZVE32F-NEXT:    andi a4, a6, 2
+; RV32ZVE32F-NEXT:    andi a4, t0, 2
 ; RV32ZVE32F-NEXT:    beqz a4, .LBB57_2
 ; RV32ZVE32F-NEXT:  .LBB57_8: # %cond.load1
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
@@ -6634,15 +6634,15 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
 ; RV32ZVE32F-NEXT:    vmv.x.s a5, v10
 ; RV32ZVE32F-NEXT:    lw a4, 0(a5)
 ; RV32ZVE32F-NEXT:    lw a5, 4(a5)
-; RV32ZVE32F-NEXT:    andi a7, a6, 4
-; RV32ZVE32F-NEXT:    beqz a7, .LBB57_3
+; RV32ZVE32F-NEXT:    andi a6, t0, 4
+; RV32ZVE32F-NEXT:    beqz a6, .LBB57_3
 ; RV32ZVE32F-NEXT:  .LBB57_9: # %cond.load4
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
-; RV32ZVE32F-NEXT:    vmv.x.s t0, v10
-; RV32ZVE32F-NEXT:    lw a7, 0(t0)
-; RV32ZVE32F-NEXT:    lw t0, 4(t0)
-; RV32ZVE32F-NEXT:    andi t1, a6, 8
+; RV32ZVE32F-NEXT:    vmv.x.s a7, v10
+; RV32ZVE32F-NEXT:    lw a6, 0(a7)
+; RV32ZVE32F-NEXT:    lw a7, 4(a7)
+; RV32ZVE32F-NEXT:    andi t1, t0, 8
 ; RV32ZVE32F-NEXT:    beqz t1, .LBB57_4
 ; RV32ZVE32F-NEXT:  .LBB57_10: # %cond.load7
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
@@ -6650,7 +6650,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
 ; RV32ZVE32F-NEXT:    vmv.x.s t2, v10
 ; RV32ZVE32F-NEXT:    lw t1, 0(t2)
 ; RV32ZVE32F-NEXT:    lw t2, 4(t2)
-; RV32ZVE32F-NEXT:    andi t3, a6, 16
+; RV32ZVE32F-NEXT:    andi t3, t0, 16
 ; RV32ZVE32F-NEXT:    beqz t3, .LBB57_5
 ; RV32ZVE32F-NEXT:  .LBB57_11: # %cond.load10
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
@@ -6658,7 +6658,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
 ; RV32ZVE32F-NEXT:    vmv.x.s t4, v10
 ; RV32ZVE32F-NEXT:    lw t3, 0(t4)
 ; RV32ZVE32F-NEXT:    lw t4, 4(t4)
-; RV32ZVE32F-NEXT:    andi t5, a6, 32
+; RV32ZVE32F-NEXT:    andi t5, t0, 32
 ; RV32ZVE32F-NEXT:    beqz t5, .LBB57_6
 ; RV32ZVE32F-NEXT:  .LBB57_12: # %cond.load13
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
@@ -6673,7 +6673,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
 ; RV32ZVE32F-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
-; RV32ZVE32F-NEXT:    andi s0, a6, 64
+; RV32ZVE32F-NEXT:    andi s0, t0, 64
 ; RV32ZVE32F-NEXT:    beqz s0, .LBB57_16
 ; RV32ZVE32F-NEXT:  # %bb.14: # %cond.load16
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
@@ -6681,30 +6681,30 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
 ; RV32ZVE32F-NEXT:    vmv.x.s s1, v10
 ; RV32ZVE32F-NEXT:    lw s0, 0(s1)
 ; RV32ZVE32F-NEXT:    lw s1, 4(s1)
-; RV32ZVE32F-NEXT:    andi a6, a6, -128
-; RV32ZVE32F-NEXT:    bnez a6, .LBB57_17
+; RV32ZVE32F-NEXT:    andi t0, t0, -128
+; RV32ZVE32F-NEXT:    bnez t0, .LBB57_17
 ; RV32ZVE32F-NEXT:  .LBB57_15:
-; RV32ZVE32F-NEXT:    lw a6, 56(a3)
+; RV32ZVE32F-NEXT:    lw t0, 56(a3)
 ; RV32ZVE32F-NEXT:    lw a3, 60(a3)
 ; RV32ZVE32F-NEXT:    j .LBB57_18
 ; RV32ZVE32F-NEXT:  .LBB57_16:
 ; RV32ZVE32F-NEXT:    lw s0, 48(a3)
 ; RV32ZVE32F-NEXT:    lw s1, 52(a3)
-; RV32ZVE32F-NEXT:    andi a6, a6, -128
-; RV32ZVE32F-NEXT:    beqz a6, .LBB57_15
+; RV32ZVE32F-NEXT:    andi t0, t0, -128
+; RV32ZVE32F-NEXT:    beqz t0, .LBB57_15
 ; RV32ZVE32F-NEXT:  .LBB57_17: # %cond.load19
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
-; RV32ZVE32F-NEXT:    lw a6, 0(a3)
+; RV32ZVE32F-NEXT:    lw t0, 0(a3)
 ; RV32ZVE32F-NEXT:    lw a3, 4(a3)
 ; RV32ZVE32F-NEXT:  .LBB57_18: # %else20
 ; RV32ZVE32F-NEXT:    sw a1, 0(a0)
 ; RV32ZVE32F-NEXT:    sw a2, 4(a0)
 ; RV32ZVE32F-NEXT:    sw a4, 8(a0)
 ; RV32ZVE32F-NEXT:    sw a5, 12(a0)
-; RV32ZVE32F-NEXT:    sw a7, 16(a0)
-; RV32ZVE32F-NEXT:    sw t0, 20(a0)
+; RV32ZVE32F-NEXT:    sw a6, 16(a0)
+; RV32ZVE32F-NEXT:    sw a7, 20(a0)
 ; RV32ZVE32F-NEXT:    sw t1, 24(a0)
 ; RV32ZVE32F-NEXT:    sw t2, 28(a0)
 ; RV32ZVE32F-NEXT:    sw t3, 32(a0)
@@ -6713,7 +6713,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
 ; RV32ZVE32F-NEXT:    sw t6, 44(a0)
 ; RV32ZVE32F-NEXT:    sw s0, 48(a0)
 ; RV32ZVE32F-NEXT:    sw s1, 52(a0)
-; RV32ZVE32F-NEXT:    sw a6, 56(a0)
+; RV32ZVE32F-NEXT:    sw t0, 56(a0)
 ; RV32ZVE32F-NEXT:    sw a3, 60(a0)
 ; RV32ZVE32F-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
@@ -6726,89 +6726,89 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
 ; RV64ZVE32F-LABEL: mgather_baseidx_v8i64:
 ; RV64ZVE32F:       # %bb.0:
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV64ZVE32F-NEXT:    vmv.x.s a6, v0
-; RV64ZVE32F-NEXT:    andi a4, a6, 1
+; RV64ZVE32F-NEXT:    vmv.x.s a7, v0
+; RV64ZVE32F-NEXT:    andi a4, a7, 1
 ; RV64ZVE32F-NEXT:    beqz a4, .LBB57_9
 ; RV64ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV64ZVE32F-NEXT:    ld a4, 0(a2)
 ; RV64ZVE32F-NEXT:    slli a4, a4, 3
 ; RV64ZVE32F-NEXT:    add a4, a1, a4
 ; RV64ZVE32F-NEXT:    ld a4, 0(a4)
-; RV64ZVE32F-NEXT:    andi a5, a6, 2
+; RV64ZVE32F-NEXT:    andi a5, a7, 2
 ; RV64ZVE32F-NEXT:    bnez a5, .LBB57_10
 ; RV64ZVE32F-NEXT:  .LBB57_2:
 ; RV64ZVE32F-NEXT:    ld a5, 8(a3)
-; RV64ZVE32F-NEXT:    andi a7, a6, 4
-; RV64ZVE32F-NEXT:    bnez a7, .LBB57_11
+; RV64ZVE32F-NEXT:    andi a6, a7, 4
+; RV64ZVE32F-NEXT:    bnez a6, .LBB57_11
 ; RV64ZVE32F-NEXT:  .LBB57_3:
-; RV64ZVE32F-NEXT:    ld a7, 16(a3)
-; RV64ZVE32F-NEXT:    andi t0, a6, 8
+; RV64ZVE32F-NEXT:    ld a6, 16(a3)
+; RV64ZVE32F-NEXT:    andi t0, a7, 8
 ; RV64ZVE32F-NEXT:    bnez t0, .LBB57_12
 ; RV64ZVE32F-NEXT:  .LBB57_4:
 ; RV64ZVE32F-NEXT:    ld t0, 24(a3)
-; RV64ZVE32F-NEXT:    andi t1, a6, 16
+; RV64ZVE32F-NEXT:    andi t1, a7, 16
 ; RV64ZVE32F-NEXT:    bnez t1, .LBB57_13
 ; RV64ZVE32F-NEXT:  .LBB57_5:
 ; RV64ZVE32F-NEXT:    ld t1, 32(a3)
-; RV64ZVE32F-NEXT:    andi t2, a6, 32
+; RV64ZVE32F-NEXT:    andi t2, a7, 32
 ; RV64ZVE32F-NEXT:    bnez t2, .LBB57_14
 ; RV64ZVE32F-NEXT:  .LBB57_6:
 ; RV64ZVE32F-NEXT:    ld t2, 40(a3)
-; RV64ZVE32F-NEXT:    andi t3, a6, 64
+; RV64ZVE32F-NEXT:    andi t3, a7, 64
 ; RV64ZVE32F-NEXT:    bnez t3, .LBB57_15
 ; RV64ZVE32F-NEXT:  .LBB57_7:
 ; RV64ZVE32F-NEXT:    ld t3, 48(a3)
-; RV64ZVE32F-NEXT:    andi a6, a6, -128
-; RV64ZVE32F-NEXT:    bnez a6, .LBB57_16
+; RV64ZVE32F-NEXT:    andi a7, a7, -128
+; RV64ZVE32F-NEXT:    bnez a7, .LBB57_16
 ; RV64ZVE32F-NEXT:  .LBB57_8:
 ; RV64ZVE32F-NEXT:    ld a1, 56(a3)
 ; RV64ZVE32F-NEXT:    j .LBB57_17
 ; RV64ZVE32F-NEXT:  .LBB57_9:
 ; RV64ZVE32F-NEXT:    ld a4, 0(a3)
-; RV64ZVE32F-NEXT:    andi a5, a6, 2
+; RV64ZVE32F-NEXT:    andi a5, a7, 2
 ; RV64ZVE32F-NEXT:    beqz a5, .LBB57_2
 ; RV64ZVE32F-NEXT:  .LBB57_10: # %cond.load1
 ; RV64ZVE32F-NEXT:    ld a5, 8(a2)
 ; RV64ZVE32F-NEXT:    slli a5, a5, 3
 ; RV64ZVE32F-NEXT:    add a5, a1, a5
 ; RV64ZVE32F-NEXT:    ld a5, 0(a5)
-; RV64ZVE32F-NEXT:    andi a7, a6, 4
-; RV64ZVE32F-NEXT:    beqz a7, .LBB57_3
+; RV64ZVE32F-NEXT:    andi a6, a7, 4
+; RV64ZVE32F-NEXT:    beqz a6, .LBB57_3
 ; RV64ZVE32F-NEXT:  .LBB57_11: # %cond.load4
-; RV64ZVE32F-NEXT:    ld a7, 16(a2)
-; RV64ZVE32F-NEXT:    slli a7, a7, 3
-; RV64ZVE32F-NEXT:    add a7, a1, a7
-; RV64ZVE32F-NEXT:    ld a7, 0(a7)
-; RV64ZVE32F-NEXT:    andi t0, a6, 8
+; RV64ZVE32F-NEXT:    ld a6, 16(a2)
+; RV64ZVE32F-NEXT:    slli a6, a6, 3
+; RV64ZVE32F-NEXT:    add a6, a1, a6
+; RV64ZVE32F-NEXT:    ld a6, 0(a6)
+; RV64ZVE32F-NEXT:    andi t0, a7, 8
 ; RV64ZVE32F-NEXT:    beqz t0, .LBB57_4
 ; RV64ZVE32F-NEXT:  .LBB57_12: # %cond.load7
 ; RV64ZVE32F-NEXT:    ld t0, 24(a2)
 ; RV64ZVE32F-NEXT:    slli t0, t0, 3
 ; RV64ZVE32F-NEXT:    add t0, a1, t0
 ; RV64ZVE32F-NEXT:    ld t0, 0(t0)
-; RV64ZVE32F-NEXT:    andi t1, a6, 16
+; RV64ZVE32F-NEXT:    andi t1, a7, 16
 ; RV64ZVE32F-NEXT:    beqz t1, .LBB57_5
 ; RV64ZVE32F-NEXT:  .LBB57_13: # %cond.load10
 ; RV64ZVE32F-NEXT:    ld t1, 32(a2)
 ; RV64ZVE32F-NEXT:    slli t1, t1, 3
 ; RV64ZVE32F-NEXT:    add t1, a1, t1
 ; RV64ZVE32F-NEXT:    ld t1, 0(t1)
-; RV64ZVE32F-NEXT:    andi t2, a6, 32
+; RV64ZVE32F-NEXT:    andi t2, a7, 32
 ; RV64ZVE32F-NEXT:    beqz t2, .LBB57_6
 ; RV64ZVE32F-NEXT:  .LBB57_14: # %cond.load13
 ; RV64ZVE32F-NEXT:    ld t2, 40(a2)
 ; RV64ZVE32F-NEXT:    slli t2, t2, 3
 ; RV64ZVE32F-NEXT:    add t2, a1, t2
 ; RV64ZVE32F-NEXT:    ld t2, 0(t2)
-; RV64ZVE32F-NEXT:    andi t3, a6, 64
+; RV64ZVE32F-NEXT:    andi t3, a7, 64
 ; RV64ZVE32F-NEXT:    beqz t3, .LBB57_7
 ; RV64ZVE32F-NEXT:  .LBB57_15: # %cond.load16
 ; RV64ZVE32F-NEXT:    ld t3, 48(a2)
 ; RV64ZVE32F-NEXT:    slli t3, t3, 3
 ; RV64ZVE32F-NEXT:    add t3, a1, t3
 ; RV64ZVE32F-NEXT:    ld t3, 0(t3)
-; RV64ZVE32F-NEXT:    andi a6, a6, -128
-; RV64ZVE32F-NEXT:    beqz a6, .LBB57_8
+; RV64ZVE32F-NEXT:    andi a7, a7, -128
+; RV64ZVE32F-NEXT:    beqz a7, .LBB57_8
 ; RV64ZVE32F-NEXT:  .LBB57_16: # %cond.load19
 ; RV64ZVE32F-NEXT:    ld a2, 56(a2)
 ; RV64ZVE32F-NEXT:    slli a2, a2, 3
@@ -6817,7 +6817,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
 ; RV64ZVE32F-NEXT:  .LBB57_17: # %else20
 ; RV64ZVE32F-NEXT:    sd a4, 0(a0)
 ; RV64ZVE32F-NEXT:    sd a5, 8(a0)
-; RV64ZVE32F-NEXT:    sd a7, 16(a0)
+; RV64ZVE32F-NEXT:    sd a6, 16(a0)
 ; RV64ZVE32F-NEXT:    sd t0, 24(a0)
 ; RV64ZVE32F-NEXT:    sd t1, 32(a0)
 ; RV64ZVE32F-NEXT:    sd t2, 40(a0)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
index 55729a680c294..07aa05f609c40 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
@@ -897,54 +897,55 @@ define void @strided_load_startval_add_with_splat(ptr noalias nocapture %arg, pt
 ; CHECK-NEXT:    beq a2, a3, .LBB14_7
 ; CHECK-NEXT:  # %bb.1: # %bb3
 ; CHECK-NEXT:    li a3, 1023
-; CHECK-NEXT:    subw a4, a3, a2
-; CHECK-NEXT:    li a5, 31
-; CHECK-NEXT:    bltu a4, a5, .LBB14_5
+; CHECK-NEXT:    subw a5, a3, a2
+; CHECK-NEXT:    li a6, 31
+; CHECK-NEXT:    mv a4, a2
+; CHECK-NEXT:    bltu a5, a6, .LBB14_5
 ; CHECK-NEXT:  # %bb.2: # %bb9
-; CHECK-NEXT:    slli a5, a4, 32
-; CHECK-NEXT:    slli a7, a2, 2
-; CHECK-NEXT:    add a4, a0, a2
+; CHECK-NEXT:    slli a4, a5, 32
+; CHECK-NEXT:    slli t0, a2, 2
+; CHECK-NEXT:    add a5, a0, a2
 ; CHECK-NEXT:    add a6, a1, a2
 ; CHECK-NEXT:    li t2, 32
-; CHECK-NEXT:    srli a5, a5, 32
-; CHECK-NEXT:    add a7, a6, a7
-; CHECK-NEXT:    addi a5, a5, 1
-; CHECK-NEXT:    andi a6, a5, -32
-; CHECK-NEXT:    add a2, a6, a2
-; CHECK-NEXT:    add t0, a2, a0
+; CHECK-NEXT:    srli a4, a4, 32
+; CHECK-NEXT:    add t0, a6, t0
+; CHECK-NEXT:    addi a6, a4, 1
+; CHECK-NEXT:    andi a7, a6, -32
+; CHECK-NEXT:    add a4, a7, a2
+; CHECK-NEXT:    add a2, a4, a0
 ; CHECK-NEXT:    li t1, 5
 ; CHECK-NEXT:    vsetvli zero, t2, e8, m1, ta, ma
 ; CHECK-NEXT:  .LBB14_3: # %bb15
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vlse8.v v8, (a7), t1
-; CHECK-NEXT:    vle8.v v9, (a4)
+; CHECK-NEXT:    vlse8.v v8, (t0), t1
+; CHECK-NEXT:    vle8.v v9, (a5)
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vse8.v v8, (a4)
-; CHECK-NEXT:    addi a4, a4, 32
-; CHECK-NEXT:    addi a7, a7, 160
-; CHECK-NEXT:    bne a4, t0, .LBB14_3
+; CHECK-NEXT:    vse8.v v8, (a5)
+; CHECK-NEXT:    addi a5, a5, 32
+; CHECK-NEXT:    addi t0, t0, 160
+; CHECK-NEXT:    bne a5, a2, .LBB14_3
 ; CHECK-NEXT:  # %bb.4: # %bb30
-; CHECK-NEXT:    beq a5, a6, .LBB14_7
+; CHECK-NEXT:    beq a6, a7, .LBB14_7
 ; CHECK-NEXT:  .LBB14_5: # %bb32
-; CHECK-NEXT:    add a4, a0, a2
-; CHECK-NEXT:    slli a5, a2, 2
-; CHECK-NEXT:    add a1, a1, a2
-; CHECK-NEXT:    subw a3, a3, a2
+; CHECK-NEXT:    add a2, a0, a4
+; CHECK-NEXT:    slli a5, a4, 2
+; CHECK-NEXT:    add a1, a1, a4
+; CHECK-NEXT:    subw a3, a3, a4
 ; CHECK-NEXT:    add a1, a1, a5
 ; CHECK-NEXT:    slli a3, a3, 32
 ; CHECK-NEXT:    srli a3, a3, 32
-; CHECK-NEXT:    add a0, a2, a0
+; CHECK-NEXT:    add a0, a4, a0
 ; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:    addi a0, a0, 1
 ; CHECK-NEXT:  .LBB14_6: # %bb35
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    lbu a2, 0(a1)
-; CHECK-NEXT:    lbu a3, 0(a4)
-; CHECK-NEXT:    add a2, a3, a2
-; CHECK-NEXT:    sb a2, 0(a4)
-; CHECK-NEXT:    addi a4, a4, 1
+; CHECK-NEXT:    lbu a3, 0(a1)
+; CHECK-NEXT:    lbu a4, 0(a2)
+; CHECK-NEXT:    add a3, a4, a3
+; CHECK-NEXT:    sb a3, 0(a2)
+; CHECK-NEXT:    addi a2, a2, 1
 ; CHECK-NEXT:    addi a1, a1, 5
-; CHECK-NEXT:    bne a4, a0, .LBB14_6
+; CHECK-NEXT:    bne a2, a0, .LBB14_6
 ; CHECK-NEXT:  .LBB14_7: # %bb34
 ; CHECK-NEXT:    ret
 bb:

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
index a9d926b56386a..08cab7cd359b9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
@@ -29,30 +29,32 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
 ; RV32-NEXT:  # %bb.3: # %for.cond1.preheader.us.preheader
 ; RV32-NEXT:    li t0, 32
 ; RV32-NEXT:  # %bb.4: # %for.cond1.preheader.us.preheader
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 4(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset s0, -4
+; RV32-NEXT:    .cfi_offset s1, -8
+; RV32-NEXT:    .cfi_offset s2, -12
+; RV32-NEXT:    .cfi_remember_state
 ; RV32-NEXT:    add t3, a0, t3
 ; RV32-NEXT:    add t4, a2, t4
-; RV32-NEXT:    add t5, a4, t5
+; RV32-NEXT:    add s0, a4, t5
 ; RV32-NEXT:    bltu t6, t1, .LBB0_6
 ; RV32-NEXT:  # %bb.5: # %for.cond1.preheader.us.preheader
 ; RV32-NEXT:    li t1, 32
 ; RV32-NEXT:  .LBB0_6: # %for.cond1.preheader.us.preheader
 ; RV32-NEXT:    add t3, t3, a6
-; RV32-NEXT:    add t6, t4, a6
-; RV32-NEXT:    add t4, t5, a6
+; RV32-NEXT:    add t5, t4, a6
+; RV32-NEXT:    add t4, s0, a6
 ; RV32-NEXT:    j .LBB0_8
 ; RV32-NEXT:  # %bb.7: # %for.cond1.preheader.us.preheader
 ; RV32-NEXT:    mv t1, t0
 ; RV32-NEXT:  .LBB0_8: # %for.cond1.preheader.us.preheader
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 4(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset s0, -4
-; RV32-NEXT:    .cfi_offset s1, -8
-; RV32-NEXT:    .cfi_offset s2, -12
+; RV32-NEXT:    .cfi_restore_state
 ; RV32-NEXT:    li t0, 0
-; RV32-NEXT:    sltu t5, a0, t6
+; RV32-NEXT:    sltu t5, a0, t5
 ; RV32-NEXT:    sltu t6, a2, t3
 ; RV32-NEXT:    and t5, t5, t6
 ; RV32-NEXT:    sltu t4, a0, t4

diff  --git a/llvm/test/CodeGen/RISCV/xcvbi.ll b/llvm/test/CodeGen/RISCV/xcvbi.ll
index d5d11585970b0..ca2e416e334f0 100644
--- a/llvm/test/CodeGen/RISCV/xcvbi.ll
+++ b/llvm/test/CodeGen/RISCV/xcvbi.ll
@@ -67,14 +67,14 @@ define i32 @select_beqimm_1(i32 %a, i32 %x, i32 %y) {
 ; CHECK_NOPT:       # %bb.0: # %entry
 ; CHECK_NOPT-NEXT:    addi sp, sp, -16
 ; CHECK_NOPT-NEXT:    .cfi_def_cfa_offset 16
-; CHECK_NOPT-NEXT:    sw a2, 8(sp) # 4-byte Folded Spill
-; CHECK_NOPT-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:    sw a2, 12(sp) # 4-byte Folded Spill
 ; CHECK_NOPT-NEXT:    cv.beqimm a0, -16, .LBB2_2
 ; CHECK_NOPT-NEXT:  # %bb.1: # %entry
-; CHECK_NOPT-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; CHECK_NOPT-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; CHECK_NOPT-NEXT:  .LBB2_2: # %entry
 ; CHECK_NOPT-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; CHECK_NOPT-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:  .LBB2_2: # %entry
+; CHECK_NOPT-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
 ; CHECK_NOPT-NEXT:    addi sp, sp, 16
 ; CHECK_NOPT-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK_NOPT-NEXT:    ret
@@ -98,14 +98,14 @@ define i32 @select_beqimm_2(i32 %a, i32 %x, i32 %y) {
 ; CHECK_NOPT:       # %bb.0: # %entry
 ; CHECK_NOPT-NEXT:    addi sp, sp, -16
 ; CHECK_NOPT-NEXT:    .cfi_def_cfa_offset 16
-; CHECK_NOPT-NEXT:    sw a2, 8(sp) # 4-byte Folded Spill
-; CHECK_NOPT-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:    sw a2, 12(sp) # 4-byte Folded Spill
 ; CHECK_NOPT-NEXT:    cv.beqimm a0, 0, .LBB3_2
 ; CHECK_NOPT-NEXT:  # %bb.1: # %entry
-; CHECK_NOPT-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; CHECK_NOPT-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; CHECK_NOPT-NEXT:  .LBB3_2: # %entry
 ; CHECK_NOPT-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; CHECK_NOPT-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:  .LBB3_2: # %entry
+; CHECK_NOPT-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
 ; CHECK_NOPT-NEXT:    addi sp, sp, 16
 ; CHECK_NOPT-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK_NOPT-NEXT:    ret
@@ -129,14 +129,14 @@ define i32 @select_beqimm_3(i32 %a, i32 %x, i32 %y) {
 ; CHECK_NOPT:       # %bb.0: # %entry
 ; CHECK_NOPT-NEXT:    addi sp, sp, -16
 ; CHECK_NOPT-NEXT:    .cfi_def_cfa_offset 16
-; CHECK_NOPT-NEXT:    sw a2, 8(sp) # 4-byte Folded Spill
-; CHECK_NOPT-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:    sw a2, 12(sp) # 4-byte Folded Spill
 ; CHECK_NOPT-NEXT:    cv.beqimm a0, 15, .LBB4_2
 ; CHECK_NOPT-NEXT:  # %bb.1: # %entry
-; CHECK_NOPT-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; CHECK_NOPT-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; CHECK_NOPT-NEXT:  .LBB4_2: # %entry
 ; CHECK_NOPT-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; CHECK_NOPT-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:  .LBB4_2: # %entry
+; CHECK_NOPT-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
 ; CHECK_NOPT-NEXT:    addi sp, sp, 16
 ; CHECK_NOPT-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK_NOPT-NEXT:    ret
@@ -160,15 +160,15 @@ define i32 @select_no_beqimm_1(i32 %a, i32 %x, i32 %y) {
 ; CHECK_NOPT:       # %bb.0: # %entry
 ; CHECK_NOPT-NEXT:    addi sp, sp, -16
 ; CHECK_NOPT-NEXT:    .cfi_def_cfa_offset 16
-; CHECK_NOPT-NEXT:    sw a2, 8(sp) # 4-byte Folded Spill
-; CHECK_NOPT-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
 ; CHECK_NOPT-NEXT:    li a1, -17
+; CHECK_NOPT-NEXT:    sw a2, 12(sp) # 4-byte Folded Spill
 ; CHECK_NOPT-NEXT:    beq a0, a1, .LBB5_2
 ; CHECK_NOPT-NEXT:  # %bb.1: # %entry
-; CHECK_NOPT-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; CHECK_NOPT-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; CHECK_NOPT-NEXT:  .LBB5_2: # %entry
 ; CHECK_NOPT-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; CHECK_NOPT-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:  .LBB5_2: # %entry
+; CHECK_NOPT-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
 ; CHECK_NOPT-NEXT:    addi sp, sp, 16
 ; CHECK_NOPT-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK_NOPT-NEXT:    ret
@@ -193,15 +193,15 @@ define i32 @select_no_beqimm_2(i32 %a, i32 %x, i32 %y) {
 ; CHECK_NOPT:       # %bb.0: # %entry
 ; CHECK_NOPT-NEXT:    addi sp, sp, -16
 ; CHECK_NOPT-NEXT:    .cfi_def_cfa_offset 16
-; CHECK_NOPT-NEXT:    sw a2, 8(sp) # 4-byte Folded Spill
-; CHECK_NOPT-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
 ; CHECK_NOPT-NEXT:    li a1, 16
+; CHECK_NOPT-NEXT:    sw a2, 12(sp) # 4-byte Folded Spill
 ; CHECK_NOPT-NEXT:    beq a0, a1, .LBB6_2
 ; CHECK_NOPT-NEXT:  # %bb.1: # %entry
-; CHECK_NOPT-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; CHECK_NOPT-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; CHECK_NOPT-NEXT:  .LBB6_2: # %entry
 ; CHECK_NOPT-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; CHECK_NOPT-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:  .LBB6_2: # %entry
+; CHECK_NOPT-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
 ; CHECK_NOPT-NEXT:    addi sp, sp, 16
 ; CHECK_NOPT-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK_NOPT-NEXT:    ret
@@ -226,14 +226,14 @@ define i32 @select_bneimm_1(i32 %a, i32 %x, i32 %y) {
 ; CHECK_NOPT:       # %bb.0: # %entry
 ; CHECK_NOPT-NEXT:    addi sp, sp, -16
 ; CHECK_NOPT-NEXT:    .cfi_def_cfa_offset 16
-; CHECK_NOPT-NEXT:    sw a2, 8(sp) # 4-byte Folded Spill
-; CHECK_NOPT-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:    sw a2, 12(sp) # 4-byte Folded Spill
 ; CHECK_NOPT-NEXT:    cv.bneimm a0, 0, .LBB7_2
 ; CHECK_NOPT-NEXT:  # %bb.1: # %entry
-; CHECK_NOPT-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; CHECK_NOPT-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; CHECK_NOPT-NEXT:  .LBB7_2: # %entry
 ; CHECK_NOPT-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
+; CHECK_NOPT-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
+; CHECK_NOPT-NEXT:  .LBB7_2: # %entry
+; CHECK_NOPT-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
 ; CHECK_NOPT-NEXT:    addi sp, sp, 16
 ; CHECK_NOPT-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK_NOPT-NEXT:    ret

diff  --git a/llvm/test/CodeGen/SystemZ/swifterror.ll b/llvm/test/CodeGen/SystemZ/swifterror.ll
index 39f0907295ff4..1b18287cac146 100644
--- a/llvm/test/CodeGen/SystemZ/swifterror.ll
+++ b/llvm/test/CodeGen/SystemZ/swifterror.ll
@@ -162,8 +162,8 @@ define float @foo_loop(ptr swifterror %error_ptr_ref, i32 %cc, float %cc2) {
 ; CHECK-O0: je
 ; CHECK-O0: lghi %r2, 16
 ; CHECK-O0: brasl %r14, malloc
-; CHECK-O0: lgr %r{{[0-9]+}}, %r2
-; CHECK-O0: mvi 8(%r2), 1
+; CHECK-O0: lgr %r[[REG1:[0-9]+]], %r2
+; CHECK-O0: mvi 8(%r[[REG1]]), 1
 ; CHECK-O0: jnh
 ; reload from stack
 ; CHECK-O0: lg %r9, [[OFFS:[0-9]+]](%r15)

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
index 08b99c67d9d55..8a5a15a57912c 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
@@ -328,14 +328,14 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(ptr nocapture readonly
 ; CHECK-LABEL: test_vec_mul_scalar_add_char:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT:    ldr.w r12, [sp, #28]
-; CHECK-NEXT:    cmp.w r12, #0
+; CHECK-NEXT:    ldr r4, [sp, #28]
+; CHECK-NEXT:    cmp r4, #0
 ; CHECK-NEXT:    beq.w .LBB5_11
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.lr.ph
-; CHECK-NEXT:    add.w r7, r1, r12
-; CHECK-NEXT:    add.w r6, r3, r12, lsl #2
+; CHECK-NEXT:    adds r7, r1, r4
+; CHECK-NEXT:    add.w r6, r3, r4, lsl #2
 ; CHECK-NEXT:    cmp r7, r3
-; CHECK-NEXT:    add.w r5, r0, r12
+; CHECK-NEXT:    add.w r5, r0, r4
 ; CHECK-NEXT:    cset r7, hi
 ; CHECK-NEXT:    cmp r6, r1
 ; CHECK-NEXT:    csel r7, zr, r7, ls
@@ -348,15 +348,15 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(ptr nocapture readonly
 ; CHECK-NEXT:    cmpeq r7, #0
 ; CHECK-NEXT:    beq .LBB5_4
 ; CHECK-NEXT:  @ %bb.2: @ %for.body.preheader
-; CHECK-NEXT:    and r8, r12, #3
-; CHECK-NEXT:    sub.w r7, r12, #1
+; CHECK-NEXT:    and r12, r4, #3
+; CHECK-NEXT:    subs r7, r4, #1
 ; CHECK-NEXT:    cmp r7, #3
 ; CHECK-NEXT:    bhs .LBB5_6
 ; CHECK-NEXT:  @ %bb.3:
-; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    mov.w r8, #0
 ; CHECK-NEXT:    b .LBB5_8
 ; CHECK-NEXT:  .LBB5_4: @ %vector.ph
-; CHECK-NEXT:    dlstp.32 lr, r12
+; CHECK-NEXT:    dlstp.32 lr, r4
 ; CHECK-NEXT:  .LBB5_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrb.u32 q0, [r0], #4
@@ -366,18 +366,18 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(ptr nocapture readonly
 ; CHECK-NEXT:    letp lr, .LBB5_5
 ; CHECK-NEXT:    b .LBB5_11
 ; CHECK-NEXT:  .LBB5_6: @ %for.body.preheader.new
-; CHECK-NEXT:    bic r7, r12, #3
+; CHECK-NEXT:    bic r7, r4, #3
 ; CHECK-NEXT:    movs r6, #1
 ; CHECK-NEXT:    subs r7, #4
 ; CHECK-NEXT:    add.w r5, r3, #8
-; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    mov.w r8, #0
 ; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
 ; CHECK-NEXT:    adds r6, r0, #3
 ; CHECK-NEXT:    adds r7, r1, #1
 ; CHECK-NEXT:  .LBB5_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrb r9, [r6, #-3]
-; CHECK-NEXT:    add.w r12, r12, #4
+; CHECK-NEXT:    add.w r8, r8, #4
 ; CHECK-NEXT:    ldrb r4, [r7, #-1]
 ; CHECK-NEXT:    smlabb r4, r4, r9, r2
 ; CHECK-NEXT:    str r4, [r5, #-8]
@@ -396,11 +396,11 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(ptr nocapture readonly
 ; CHECK-NEXT:    adds r5, #16
 ; CHECK-NEXT:    le lr, .LBB5_7
 ; CHECK-NEXT:  .LBB5_8: @ %for.cond.cleanup.loopexit.unr-lcssa
-; CHECK-NEXT:    wls lr, r8, .LBB5_11
+; CHECK-NEXT:    wls lr, r12, .LBB5_11
 ; CHECK-NEXT:  @ %bb.9: @ %for.body.epil.preheader
-; CHECK-NEXT:    add r0, r12
-; CHECK-NEXT:    add r1, r12
-; CHECK-NEXT:    add.w r3, r3, r12, lsl #2
+; CHECK-NEXT:    add r0, r8
+; CHECK-NEXT:    add r1, r8
+; CHECK-NEXT:    add.w r3, r3, r8, lsl #2
 ; CHECK-NEXT:  .LBB5_10: @ %for.body.epil
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrb r7, [r0], #1
@@ -604,14 +604,14 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(ptr nocapture readonl
 ; CHECK-LABEL: test_vec_mul_scalar_add_uchar:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT:    ldr.w r12, [sp, #28]
-; CHECK-NEXT:    cmp.w r12, #0
+; CHECK-NEXT:    ldr r4, [sp, #28]
+; CHECK-NEXT:    cmp r4, #0
 ; CHECK-NEXT:    beq.w .LBB7_11
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.lr.ph
-; CHECK-NEXT:    add.w r7, r1, r12
-; CHECK-NEXT:    add.w r6, r3, r12, lsl #2
+; CHECK-NEXT:    adds r7, r1, r4
+; CHECK-NEXT:    add.w r6, r3, r4, lsl #2
 ; CHECK-NEXT:    cmp r7, r3
-; CHECK-NEXT:    add.w r5, r0, r12
+; CHECK-NEXT:    add.w r5, r0, r4
 ; CHECK-NEXT:    cset r7, hi
 ; CHECK-NEXT:    cmp r6, r1
 ; CHECK-NEXT:    csel r7, zr, r7, ls
@@ -624,15 +624,15 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(ptr nocapture readonl
 ; CHECK-NEXT:    cmpeq r7, #0
 ; CHECK-NEXT:    beq .LBB7_4
 ; CHECK-NEXT:  @ %bb.2: @ %for.body.preheader
-; CHECK-NEXT:    and r8, r12, #3
-; CHECK-NEXT:    sub.w r7, r12, #1
+; CHECK-NEXT:    and r12, r4, #3
+; CHECK-NEXT:    subs r7, r4, #1
 ; CHECK-NEXT:    cmp r7, #3
 ; CHECK-NEXT:    bhs .LBB7_6
 ; CHECK-NEXT:  @ %bb.3:
-; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    mov.w r8, #0
 ; CHECK-NEXT:    b .LBB7_8
 ; CHECK-NEXT:  .LBB7_4: @ %vector.ph
-; CHECK-NEXT:    dlstp.32 lr, r12
+; CHECK-NEXT:    dlstp.32 lr, r4
 ; CHECK-NEXT:  .LBB7_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrb.u32 q0, [r0], #4
@@ -642,18 +642,18 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(ptr nocapture readonl
 ; CHECK-NEXT:    letp lr, .LBB7_5
 ; CHECK-NEXT:    b .LBB7_11
 ; CHECK-NEXT:  .LBB7_6: @ %for.body.preheader.new
-; CHECK-NEXT:    bic r7, r12, #3
+; CHECK-NEXT:    bic r7, r4, #3
 ; CHECK-NEXT:    movs r6, #1
 ; CHECK-NEXT:    subs r7, #4
 ; CHECK-NEXT:    add.w r5, r3, #8
-; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    mov.w r8, #0
 ; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
 ; CHECK-NEXT:    adds r6, r0, #3
 ; CHECK-NEXT:    adds r7, r1, #1
 ; CHECK-NEXT:  .LBB7_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrb r9, [r6, #-3]
-; CHECK-NEXT:    add.w r12, r12, #4
+; CHECK-NEXT:    add.w r8, r8, #4
 ; CHECK-NEXT:    ldrb r4, [r7, #-1]
 ; CHECK-NEXT:    smlabb r4, r4, r9, r2
 ; CHECK-NEXT:    str r4, [r5, #-8]
@@ -672,11 +672,11 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(ptr nocapture readonl
 ; CHECK-NEXT:    adds r5, #16
 ; CHECK-NEXT:    le lr, .LBB7_7
 ; CHECK-NEXT:  .LBB7_8: @ %for.cond.cleanup.loopexit.unr-lcssa
-; CHECK-NEXT:    wls lr, r8, .LBB7_11
+; CHECK-NEXT:    wls lr, r12, .LBB7_11
 ; CHECK-NEXT:  @ %bb.9: @ %for.body.epil.preheader
-; CHECK-NEXT:    add r0, r12
-; CHECK-NEXT:    add r1, r12
-; CHECK-NEXT:    add.w r3, r3, r12, lsl #2
+; CHECK-NEXT:    add r0, r8
+; CHECK-NEXT:    add r1, r8
+; CHECK-NEXT:    add.w r3, r3, r8, lsl #2
 ; CHECK-NEXT:  .LBB7_10: @ %for.body.epil
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrb r7, [r0], #1
@@ -880,14 +880,14 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(ptr nocapture readonly
 ; CHECK-LABEL: test_vec_mul_scalar_add_int:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT:    ldr.w r12, [sp, #28]
-; CHECK-NEXT:    cmp.w r12, #0
+; CHECK-NEXT:    ldr r4, [sp, #28]
+; CHECK-NEXT:    cmp r4, #0
 ; CHECK-NEXT:    beq.w .LBB9_11
 ; CHECK-NEXT:  @ %bb.1: @ %vector.memcheck
-; CHECK-NEXT:    add.w r7, r1, r12, lsl #2
-; CHECK-NEXT:    add.w r6, r3, r12, lsl #2
+; CHECK-NEXT:    add.w r7, r1, r4, lsl #2
+; CHECK-NEXT:    add.w r6, r3, r4, lsl #2
 ; CHECK-NEXT:    cmp r7, r3
-; CHECK-NEXT:    add.w r5, r0, r12, lsl #2
+; CHECK-NEXT:    add.w r5, r0, r4, lsl #2
 ; CHECK-NEXT:    cset r7, hi
 ; CHECK-NEXT:    cmp r6, r1
 ; CHECK-NEXT:    csel r7, zr, r7, ls
@@ -900,15 +900,15 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(ptr nocapture readonly
 ; CHECK-NEXT:    cmpeq r7, #0
 ; CHECK-NEXT:    beq .LBB9_4
 ; CHECK-NEXT:  @ %bb.2: @ %for.body.preheader
-; CHECK-NEXT:    and r8, r12, #3
-; CHECK-NEXT:    sub.w r7, r12, #1
+; CHECK-NEXT:    and r12, r4, #3
+; CHECK-NEXT:    subs r7, r4, #1
 ; CHECK-NEXT:    cmp r7, #3
 ; CHECK-NEXT:    bhs .LBB9_6
 ; CHECK-NEXT:  @ %bb.3:
-; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    mov.w r8, #0
 ; CHECK-NEXT:    b .LBB9_8
 ; CHECK-NEXT:  .LBB9_4: @ %vector.ph
-; CHECK-NEXT:    dlstp.32 lr, r12
+; CHECK-NEXT:    dlstp.32 lr, r4
 ; CHECK-NEXT:  .LBB9_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
@@ -918,18 +918,18 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(ptr nocapture readonly
 ; CHECK-NEXT:    letp lr, .LBB9_5
 ; CHECK-NEXT:    b .LBB9_11
 ; CHECK-NEXT:  .LBB9_6: @ %for.body.preheader.new
-; CHECK-NEXT:    bic r7, r12, #3
+; CHECK-NEXT:    bic r7, r4, #3
 ; CHECK-NEXT:    movs r6, #1
 ; CHECK-NEXT:    subs r7, #4
 ; CHECK-NEXT:    add.w r5, r3, #8
-; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    mov.w r8, #0
 ; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
 ; CHECK-NEXT:    add.w r6, r0, #8
 ; CHECK-NEXT:    add.w r7, r1, #8
 ; CHECK-NEXT:  .LBB9_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r9, [r6, #-8]
-; CHECK-NEXT:    add.w r12, r12, #4
+; CHECK-NEXT:    add.w r8, r8, #4
 ; CHECK-NEXT:    ldr r4, [r7, #-8]
 ; CHECK-NEXT:    mla r4, r4, r9, r2
 ; CHECK-NEXT:    str r4, [r5, #-8]
@@ -950,11 +950,11 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(ptr nocapture readonly
 ; CHECK-NEXT:    adds r5, #16
 ; CHECK-NEXT:    le lr, .LBB9_7
 ; CHECK-NEXT:  .LBB9_8: @ %for.cond.cleanup.loopexit.unr-lcssa
-; CHECK-NEXT:    wls lr, r8, .LBB9_11
+; CHECK-NEXT:    wls lr, r12, .LBB9_11
 ; CHECK-NEXT:  @ %bb.9: @ %for.body.epil.preheader
-; CHECK-NEXT:    add.w r0, r0, r12, lsl #2
-; CHECK-NEXT:    add.w r1, r1, r12, lsl #2
-; CHECK-NEXT:    add.w r3, r3, r12, lsl #2
+; CHECK-NEXT:    add.w r0, r0, r8, lsl #2
+; CHECK-NEXT:    add.w r1, r1, r8, lsl #2
+; CHECK-NEXT:    add.w r3, r3, r8, lsl #2
 ; CHECK-NEXT:  .LBB9_10: @ %for.body.epil
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r7, [r0], #4

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
index 7acc83343dcb8..a0e690212d5a4 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
@@ -17,18 +17,18 @@ define dso_local void @check_option(ptr noalias nocapture %A, ptr noalias nocapt
 ; ENABLED-NEXT:  .LBB0_2: @ %vector.ph
 ; ENABLED-NEXT:    @ =>This Loop Header: Depth=1
 ; ENABLED-NEXT:    @ Child Loop BB0_3 Depth 2
-; ENABLED-NEXT:    mov r4, r0
-; ENABLED-NEXT:    mov r5, r2
-; ENABLED-NEXT:    mov r6, r1
-; ENABLED-NEXT:    mov r7, r3
-; ENABLED-NEXT:    dlstp.32 lr, r7
+; ENABLED-NEXT:    mov r12, r0
+; ENABLED-NEXT:    mov r4, r2
+; ENABLED-NEXT:    mov r5, r1
+; ENABLED-NEXT:    mov r6, r3
+; ENABLED-NEXT:    dlstp.32 lr, r6
 ; ENABLED-NEXT:  .LBB0_3: @ %vector.body
 ; ENABLED-NEXT:    @ Parent Loop BB0_2 Depth=1
 ; ENABLED-NEXT:    @ => This Inner Loop Header: Depth=2
-; ENABLED-NEXT:    vldrw.u32 q0, [r6], #16
-; ENABLED-NEXT:    vldrw.u32 q1, [r5], #16
+; ENABLED-NEXT:    vldrw.u32 q0, [r5], #16
+; ENABLED-NEXT:    vldrw.u32 q1, [r4], #16
 ; ENABLED-NEXT:    vadd.i32 q0, q1, q0
-; ENABLED-NEXT:    vstrw.32 q0, [r4], #16
+; ENABLED-NEXT:    vstrw.32 q0, [r12], #16
 ; ENABLED-NEXT:    letp lr, .LBB0_3
 ; ENABLED-NEXT:    b .LBB0_2
 ; ENABLED-NEXT:  .LBB0_4: @ %for.cond.cleanup
@@ -44,29 +44,29 @@ define dso_local void @check_option(ptr noalias nocapture %A, ptr noalias nocapt
 ; DISABLED-NEXT:    movs r6, #1
 ; DISABLED-NEXT:    bic r7, r7, #3
 ; DISABLED-NEXT:    subs r7, #4
-; DISABLED-NEXT:    add.w r12, r6, r7, lsr #2
+; DISABLED-NEXT:    add.w r8, r6, r7, lsr #2
 ; DISABLED-NEXT:  .LBB0_2: @ %vector.ph
 ; DISABLED-NEXT:    @ =>This Loop Header: Depth=1
 ; DISABLED-NEXT:    @ Child Loop BB0_3 Depth 2
-; DISABLED-NEXT:    mov r8, r12
-; DISABLED-NEXT:    mov r4, r0
-; DISABLED-NEXT:    mov r5, r2
-; DISABLED-NEXT:    mov r6, r1
-; DISABLED-NEXT:    mov r7, r3
-; DISABLED-NEXT:    dls lr, r12
+; DISABLED-NEXT:    mov r7, r8
+; DISABLED-NEXT:    mov r12, r0
+; DISABLED-NEXT:    mov r4, r2
+; DISABLED-NEXT:    mov r5, r1
+; DISABLED-NEXT:    mov r6, r3
+; DISABLED-NEXT:    dls lr, r8
 ; DISABLED-NEXT:  .LBB0_3: @ %vector.body
 ; DISABLED-NEXT:    @ Parent Loop BB0_2 Depth=1
 ; DISABLED-NEXT:    @ => This Inner Loop Header: Depth=2
-; DISABLED-NEXT:    vctp.32 r7
-; DISABLED-NEXT:    mov lr, r8
+; DISABLED-NEXT:    vctp.32 r6
+; DISABLED-NEXT:    mov lr, r7
 ; DISABLED-NEXT:    vpstt
-; DISABLED-NEXT:    vldrwt.u32 q0, [r6], #16
-; DISABLED-NEXT:    vldrwt.u32 q1, [r5], #16
-; DISABLED-NEXT:    sub.w r8, r8, #1
-; DISABLED-NEXT:    subs r7, #4
+; DISABLED-NEXT:    vldrwt.u32 q0, [r5], #16
+; DISABLED-NEXT:    vldrwt.u32 q1, [r4], #16
+; DISABLED-NEXT:    subs r7, #1
+; DISABLED-NEXT:    subs r6, #4
 ; DISABLED-NEXT:    vadd.i32 q0, q1, q0
 ; DISABLED-NEXT:    vpst
-; DISABLED-NEXT:    vstrwt.32 q0, [r4], #16
+; DISABLED-NEXT:    vstrwt.32 q0, [r12], #16
 ; DISABLED-NEXT:    le lr, .LBB0_3
 ; DISABLED-NEXT:    b .LBB0_2
 ; DISABLED-NEXT:  .LBB0_4: @ %for.cond.cleanup

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
index 736d5956b6194..07c06e10979cd 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
@@ -29,7 +29,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
 ; ENABLED-NEXT:    mov.w r8, #0
 ; ENABLED-NEXT:    mov r9, r12
 ; ENABLED-NEXT:    uxth r0, r0
-; ENABLED-NEXT:    rsbs r6, r0, #0
+; ENABLED-NEXT:    rsbs r5, r0, #0
 ; ENABLED-NEXT:    b .LBB0_4
 ; ENABLED-NEXT:  .LBB0_2: @ in Loop: Header=BB0_4 Depth=1
 ; ENABLED-NEXT:    movs r0, #0
@@ -52,9 +52,9 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
 ; ENABLED-NEXT:    bic r0, r9, #3
 ; ENABLED-NEXT:    movs r7, #1
 ; ENABLED-NEXT:    subs r0, #4
-; ENABLED-NEXT:    sub.w r5, r2, r8
+; ENABLED-NEXT:    sub.w r4, r2, r8
 ; ENABLED-NEXT:    vmov.i32 q1, #0x0
-; ENABLED-NEXT:    add.w r4, r7, r0, lsr #2
+; ENABLED-NEXT:    add.w r6, r7, r0, lsr #2
 ; ENABLED-NEXT:    sub.w r0, r12, r8
 ; ENABLED-NEXT:    bic r0, r0, #3
 ; ENABLED-NEXT:    subs r0, #4
@@ -65,16 +65,16 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
 ; ENABLED-NEXT:  .LBB0_6: @ %vector.body
 ; ENABLED-NEXT:    @ Parent Loop BB0_4 Depth=1
 ; ENABLED-NEXT:    @ => This Inner Loop Header: Depth=2
-; ENABLED-NEXT:    vctp.32 r5
+; ENABLED-NEXT:    vctp.32 r4
 ; ENABLED-NEXT:    vmov q0, q1
 ; ENABLED-NEXT:    vpstt
 ; ENABLED-NEXT:    vldrht.s32 q1, [r0], #8
 ; ENABLED-NEXT:    vldrht.s32 q2, [r7], #8
-; ENABLED-NEXT:    mov lr, r4
-; ENABLED-NEXT:    subs r4, #1
+; ENABLED-NEXT:    mov lr, r6
+; ENABLED-NEXT:    subs r6, #1
 ; ENABLED-NEXT:    vmul.i32 q1, q2, q1
-; ENABLED-NEXT:    subs r5, #4
-; ENABLED-NEXT:    vshl.s32 q1, r6
+; ENABLED-NEXT:    subs r4, #4
+; ENABLED-NEXT:    vshl.s32 q1, r5
 ; ENABLED-NEXT:    vadd.i32 q1, q1, q0
 ; ENABLED-NEXT:    le lr, .LBB0_6
 ; ENABLED-NEXT:  @ %bb.7: @ %middle.block
@@ -100,7 +100,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
 ; NOREDUCTIONS-NEXT:    mov.w r8, #0
 ; NOREDUCTIONS-NEXT:    mov r9, r12
 ; NOREDUCTIONS-NEXT:    uxth r0, r0
-; NOREDUCTIONS-NEXT:    rsbs r6, r0, #0
+; NOREDUCTIONS-NEXT:    rsbs r5, r0, #0
 ; NOREDUCTIONS-NEXT:    b .LBB0_4
 ; NOREDUCTIONS-NEXT:  .LBB0_2: @ in Loop: Header=BB0_4 Depth=1
 ; NOREDUCTIONS-NEXT:    movs r0, #0
@@ -123,9 +123,9 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
 ; NOREDUCTIONS-NEXT:    bic r0, r9, #3
 ; NOREDUCTIONS-NEXT:    movs r7, #1
 ; NOREDUCTIONS-NEXT:    subs r0, #4
-; NOREDUCTIONS-NEXT:    sub.w r5, r2, r8
+; NOREDUCTIONS-NEXT:    sub.w r4, r2, r8
 ; NOREDUCTIONS-NEXT:    vmov.i32 q1, #0x0
-; NOREDUCTIONS-NEXT:    add.w r4, r7, r0, lsr #2
+; NOREDUCTIONS-NEXT:    add.w r6, r7, r0, lsr #2
 ; NOREDUCTIONS-NEXT:    sub.w r0, r12, r8
 ; NOREDUCTIONS-NEXT:    bic r0, r0, #3
 ; NOREDUCTIONS-NEXT:    subs r0, #4
@@ -136,16 +136,16 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
 ; NOREDUCTIONS-NEXT:  .LBB0_6: @ %vector.body
 ; NOREDUCTIONS-NEXT:    @ Parent Loop BB0_4 Depth=1
 ; NOREDUCTIONS-NEXT:    @ => This Inner Loop Header: Depth=2
-; NOREDUCTIONS-NEXT:    vctp.32 r5
+; NOREDUCTIONS-NEXT:    vctp.32 r4
 ; NOREDUCTIONS-NEXT:    vmov q0, q1
 ; NOREDUCTIONS-NEXT:    vpstt
 ; NOREDUCTIONS-NEXT:    vldrht.s32 q1, [r0], #8
 ; NOREDUCTIONS-NEXT:    vldrht.s32 q2, [r7], #8
-; NOREDUCTIONS-NEXT:    mov lr, r4
-; NOREDUCTIONS-NEXT:    subs r4, #1
+; NOREDUCTIONS-NEXT:    mov lr, r6
+; NOREDUCTIONS-NEXT:    subs r6, #1
 ; NOREDUCTIONS-NEXT:    vmul.i32 q1, q2, q1
-; NOREDUCTIONS-NEXT:    subs r5, #4
-; NOREDUCTIONS-NEXT:    vshl.s32 q1, r6
+; NOREDUCTIONS-NEXT:    subs r4, #4
+; NOREDUCTIONS-NEXT:    vshl.s32 q1, r5
 ; NOREDUCTIONS-NEXT:    vadd.i32 q1, q1, q0
 ; NOREDUCTIONS-NEXT:    le lr, .LBB0_6
 ; NOREDUCTIONS-NEXT:  @ %bb.7: @ %middle.block

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
index a6a9361050731..cbcbf1f392ce8 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
@@ -165,69 +165,74 @@ define dso_local i32 @b(ptr %c, i32 %d, i32 %e, ptr %n) "frame-pointer"="all" {
 ; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    wls lr, r1, .LBB2_3
 ; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
-; CHECK-NEXT:    add.w r12, r3, #4
-; CHECK-NEXT:    add.w r9, r0, #4
-; CHECK-NEXT:    mvn r10, #1
-; CHECK-NEXT:    @ implicit-def: $r6
-; CHECK-NEXT:    @ implicit-def: $r8
+; CHECK-NEXT:    adds r6, r3, #4
+; CHECK-NEXT:    adds r1, r0, #4
+; CHECK-NEXT:    mvn r8, #1
+; CHECK-NEXT:    @ implicit-def: $r9
+; CHECK-NEXT:    @ implicit-def: $r4
 ; CHECK-NEXT:    str r2, [sp] @ 4-byte Spill
 ; CHECK-NEXT:  .LBB2_2: @ %while.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr.w r1, [r9]
-; CHECK-NEXT:    asr.w r2, r8, #31
-; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    asrs r2, r4, #31
+; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [r1]
 ; CHECK-NEXT:    muls r1, r3, r1
-; CHECK-NEXT:    adds.w r5, r8, r1
+; CHECK-NEXT:    adds r4, r4, r1
 ; CHECK-NEXT:    adc.w r1, r2, r1, asr #31
-; CHECK-NEXT:    adds.w r2, r5, #-2147483648
-; CHECK-NEXT:    ldrd r2, r5, [r10]
-; CHECK-NEXT:    adc r8, r1, #0
-; CHECK-NEXT:    asr.w r1, r8, #31
-; CHECK-NEXT:    strd r6, r2, [sp, #4] @ 8-byte Folded Spill
-; CHECK-NEXT:    smull r5, r6, r5, r6
-; CHECK-NEXT:    subs.w r5, r8, r5
-; CHECK-NEXT:    sbcs r1, r6
-; CHECK-NEXT:    adds.w r6, r5, #-2147483648
+; CHECK-NEXT:    adds.w r2, r4, #-2147483648
+; CHECK-NEXT:    ldrd r2, r4, [r8]
 ; CHECK-NEXT:    adc r5, r1, #0
-; CHECK-NEXT:    ldr r1, [r9, #-4]
-; CHECK-NEXT:    add.w r9, r9, #4
-; CHECK-NEXT:    muls r1, r3, r1
-; CHECK-NEXT:    adds.w r2, r1, #-2147483648
-; CHECK-NEXT:    asr.w r4, r1, #31
-; CHECK-NEXT:    ldr.w r1, [r12]
-; CHECK-NEXT:    adc r3, r4, #0
-; CHECK-NEXT:    mul r4, r1, r0
-; CHECK-NEXT:    adds r0, #4
-; CHECK-NEXT:    add.w r4, r4, #-2147483648
-; CHECK-NEXT:    asrl r2, r3, r4
-; CHECK-NEXT:    smull r2, r3, r1, r2
-; CHECK-NEXT:    lsll r2, r3, #30
-; CHECK-NEXT:    asr.w r11, r3, #31
-; CHECK-NEXT:    mov r4, r3
-; CHECK-NEXT:    ldrd r3, r2, [sp, #4] @ 8-byte Folded Reload
-; CHECK-NEXT:    lsll r4, r11, r1
-; CHECK-NEXT:    lsrl r4, r11, #2
-; CHECK-NEXT:    muls r3, r2, r3
-; CHECK-NEXT:    adds r3, #2
-; CHECK-NEXT:    lsll r4, r11, r3
-; CHECK-NEXT:    ldr r3, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    add.w r2, r4, #-2147483648
-; CHECK-NEXT:    asrl r6, r5, r2
-; CHECK-NEXT:    movs r2, #2
-; CHECK-NEXT:    lsrl r6, r5, #2
+; CHECK-NEXT:    str r2, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    smull r4, r2, r4, r9
+; CHECK-NEXT:    asrs r1, r5, #31
+; CHECK-NEXT:    str r5, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT:    subs r4, r5, r4
+; CHECK-NEXT:    sbcs r1, r2
+; CHECK-NEXT:    ldr r2, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    adds.w r10, r4, #-2147483648
+; CHECK-NEXT:    adc r1, r1, #0
+; CHECK-NEXT:    ldr r4, [r2, #-4]
+; CHECK-NEXT:    muls r4, r3, r4
 ; CHECK-NEXT:    adds r3, #4
-; CHECK-NEXT:    str r6, [r2]
-; CHECK-NEXT:    ldr r2, [r10], #-4
-; CHECK-NEXT:    mls r4, r2, r1, r8
+; CHECK-NEXT:    adds.w r12, r4, #-2147483648
+; CHECK-NEXT:    asr.w r5, r4, #31
+; CHECK-NEXT:    ldr r4, [r6]
+; CHECK-NEXT:    adc r5, r5, #0
+; CHECK-NEXT:    mul r2, r4, r0
+; CHECK-NEXT:    adds r0, #4
+; CHECK-NEXT:    add.w r2, r2, #-2147483648
+; CHECK-NEXT:    asrl r12, r5, r2
+; CHECK-NEXT:    smull r2, r5, r4, r12
+; CHECK-NEXT:    lsll r2, r5, #30
+; CHECK-NEXT:    ldr r2, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    asr.w r11, r5, #31
+; CHECK-NEXT:    mov r12, r5
+; CHECK-NEXT:    lsll r12, r11, r4
+; CHECK-NEXT:    mul r2, r2, r9
+; CHECK-NEXT:    lsrl r12, r11, #2
+; CHECK-NEXT:    adds r2, #2
+; CHECK-NEXT:    lsll r12, r11, r2
 ; CHECK-NEXT:    ldr r2, [sp] @ 4-byte Reload
-; CHECK-NEXT:    adds.w r8, r4, #-2147483648
-; CHECK-NEXT:    asr.w r1, r4, #31
+; CHECK-NEXT:    add.w r5, r12, #-2147483648
+; CHECK-NEXT:    asrl r10, r1, r5
+; CHECK-NEXT:    ldr r5, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    lsrl r10, r1, #2
+; CHECK-NEXT:    movs r1, #2
+; CHECK-NEXT:    mov r9, r10
+; CHECK-NEXT:    str.w r10, [r1]
+; CHECK-NEXT:    ldr r1, [r8], #-4
+; CHECK-NEXT:    mls r5, r1, r4, r5
+; CHECK-NEXT:    adds.w r4, r5, #-2147483648
+; CHECK-NEXT:    asr.w r1, r5, #31
 ; CHECK-NEXT:    adc r1, r1, #0
-; CHECK-NEXT:    lsrl r8, r1, #2
-; CHECK-NEXT:    rsb.w r1, r8, #0
+; CHECK-NEXT:    lsrl r4, r1, #2
+; CHECK-NEXT:    rsbs r1, r4, #0
 ; CHECK-NEXT:    str r1, [r2]
-; CHECK-NEXT:    str r1, [r12, #-4]
-; CHECK-NEXT:    add.w r12, r12, #4
+; CHECK-NEXT:    str r1, [r6, #-4]
+; CHECK-NEXT:    adds r6, #4
+; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    adds r1, #4
 ; CHECK-NEXT:    le lr, .LBB2_2
 ; CHECK-NEXT:  .LBB2_3: @ %while.end
 ; CHECK-NEXT:    add sp, #16

diff  --git a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
index edbbbf25aab0a..d076cb00ad7e0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
@@ -355,8 +355,8 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    mov lr, r0
 ; CHECK-NEXT:    subs r0, #1
 ; CHECK-NEXT:    sbcs r0, r1, #0
@@ -375,7 +375,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
 ; CHECK-NEXT:    movw r2, #43691
 ; CHECK-NEXT:    adds r1, #2
 ; CHECK-NEXT:    movt r2, #43690
-; CHECK-NEXT:    ldr r6, [sp, #120]
+; CHECK-NEXT:    ldr r6, [sp, #128]
 ; CHECK-NEXT:    movw r8, :lower16:c
 ; CHECK-NEXT:    umull r1, r2, r1, r2
 ; CHECK-NEXT:    movt r8, :upper16:c
@@ -384,6 +384,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
 ; CHECK-NEXT:    @ implicit-def: $r5
 ; CHECK-NEXT:    @ implicit-def: $r11
 ; CHECK-NEXT:    mov.w r9, #12
+; CHECK-NEXT:    str r4, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    add.w r1, r1, r2, lsr #1
 ; CHECK-NEXT:    add.w r0, r0, r2, lsr #1
 ; CHECK-NEXT:    bic r3, r1, #3
@@ -394,7 +395,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
 ; CHECK-NEXT:    vdup.32 q6, r0
 ; CHECK-NEXT:    vadd.i32 q4, q0, r7
 ; CHECK-NEXT:    vdup.32 q7, r0
-; CHECK-NEXT:    strd r3, r7, [sp] @ 8-byte Folded Spill
+; CHECK-NEXT:    strd r3, r7, [sp, #4] @ 8-byte Folded Spill
 ; CHECK-NEXT:    b .LBB1_6
 ; CHECK-NEXT:  .LBB1_2: @ %for.body6.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB1_6 Depth=1
@@ -443,19 +444,21 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
 ; CHECK-NEXT:    bhi .LBB1_17
 ; CHECK-NEXT:  @ %bb.8: @ %for.body6.us.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT:    ldrd r2, r3, [sp, #112]
+; CHECK-NEXT:    ldrd r2, r3, [sp, #120]
 ; CHECK-NEXT:    movs r0, #32
 ; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    mov r4, r6
 ; CHECK-NEXT:    mov r7, r12
 ; CHECK-NEXT:    mov r6, lr
 ; CHECK-NEXT:    bl __aeabi_ldivmod
 ; CHECK-NEXT:    mov lr, r6
+; CHECK-NEXT:    mov r6, r4
 ; CHECK-NEXT:    mov r12, r7
-; CHECK-NEXT:    ldr r3, [sp] @ 4-byte Reload
+; CHECK-NEXT:    ldr r3, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    ldr r4, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    vdup.32 q0, r2
-; CHECK-NEXT:    ldr r6, [sp, #120]
+; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    mov r0, r11
-; CHECK-NEXT:    ldr r7, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    b .LBB1_10
 ; CHECK-NEXT:  .LBB1_9: @ %for.cond.cleanup17.us
 ; CHECK-NEXT:    @ in Loop: Header=BB1_10 Depth=2
@@ -570,7 +573,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    b .LBB1_27
 ; CHECK-NEXT:  .LBB1_28: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}

diff  --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index 4c0ded4515b65..28166e455aba2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -999,7 +999,7 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    sub sp, #32
 ; CHECK-NEXT:    ldrh r6, [r0]
 ; CHECK-NEXT:    movs r5, #1
-; CHECK-NEXT:    ldrd r4, r9, [r0, #4]
+; CHECK-NEXT:    ldrd r4, r10, [r0, #4]
 ; CHECK-NEXT:    sub.w r0, r6, #8
 ; CHECK-NEXT:    add.w r3, r0, r0, lsr #29
 ; CHECK-NEXT:    and r0, r0, #7
@@ -1008,11 +1008,10 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    it gt
 ; CHECK-NEXT:    asrgt r5, r3, #3
 ; CHECK-NEXT:    add.w r3, r4, r6, lsl #2
-; CHECK-NEXT:    subs r3, #4
-; CHECK-NEXT:    str r3, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT:    sub.w r9, r3, #4
 ; CHECK-NEXT:    rsbs r3, r6, #0
 ; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT:    add.w r3, r9, #32
+; CHECK-NEXT:    add.w r3, r10, #32
 ; CHECK-NEXT:    str r5, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT:    str r6, [sp, #16] @ 4-byte Spill
 ; CHECK-NEXT:    str r3, [sp, #8] @ 4-byte Spill
@@ -1025,7 +1024,8 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    b .LBB16_5
 ; CHECK-NEXT:  .LBB16_4: @ %for.end
 ; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT:    ldrd r0, r1, [sp, #20] @ 8-byte Folded Reload
+; CHECK-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT:    ldrd r0, r9, [sp, #20] @ 8-byte Folded Reload
 ; CHECK-NEXT:    wls lr, r0, .LBB16_5
 ; CHECK-NEXT:    b .LBB16_10
 ; CHECK-NEXT:  .LBB16_5: @ %while.end
@@ -1040,17 +1040,15 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB16_8 Depth 2
 ; CHECK-NEXT:    @ Child Loop BB16_11 Depth 2
+; CHECK-NEXT:    add.w lr, r10, #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
-; CHECK-NEXT:    add.w lr, r9, #8
-; CHECK-NEXT:    ldrd r3, r7, [r9]
-; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT:    ldm.w lr, {r0, r5, lr}
-; CHECK-NEXT:    ldrd r10, r11, [r9, #20]
-; CHECK-NEXT:    ldr.w r8, [r9, #28]
-; CHECK-NEXT:    vstrb.8 q0, [r1], #16
+; CHECK-NEXT:    ldrd r3, r7, [r10]
+; CHECK-NEXT:    ldm.w lr, {r0, r5, r6, lr}
+; CHECK-NEXT:    ldrd r11, r8, [r10, #24]
+; CHECK-NEXT:    vstrb.8 q0, [r9], #16
 ; CHECK-NEXT:    vldrw.u32 q0, [r4], #32
 ; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT:    str.w r9, [sp, #24] @ 4-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q1, [r4, #-28]
 ; CHECK-NEXT:    vmul.f32 q0, q0, r3
 ; CHECK-NEXT:    vldrw.u32 q6, [r4, #-24]
@@ -1061,9 +1059,9 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    vldrw.u32 q2, [r4, #-12]
 ; CHECK-NEXT:    vfma.f32 q0, q4, r5
 ; CHECK-NEXT:    vldrw.u32 q3, [r4, #-8]
-; CHECK-NEXT:    vfma.f32 q0, q5, lr
+; CHECK-NEXT:    vfma.f32 q0, q5, r6
 ; CHECK-NEXT:    vldrw.u32 q1, [r4, #-4]
-; CHECK-NEXT:    vfma.f32 q0, q2, r10
+; CHECK-NEXT:    vfma.f32 q0, q2, lr
 ; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    vfma.f32 q0, q3, r11
 ; CHECK-NEXT:    vfma.f32 q0, q1, r8
@@ -1077,26 +1075,25 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:  .LBB16_8: @ %for.body
 ; CHECK-NEXT:    @ Parent Loop BB16_6 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldm.w r7, {r0, r3, r5, r6}
+; CHECK-NEXT:    ldm.w r7, {r0, r3, r5, r6, r8, r11}
 ; CHECK-NEXT:    vldrw.u32 q1, [r4], #32
-; CHECK-NEXT:    add.w r11, r7, #16
 ; CHECK-NEXT:    vldrw.u32 q6, [r4, #-24]
 ; CHECK-NEXT:    vldrw.u32 q4, [r4, #-20]
 ; CHECK-NEXT:    vfma.f32 q0, q1, r0
 ; CHECK-NEXT:    vldrw.u32 q1, [r4, #-28]
-; CHECK-NEXT:    ldm.w r11, {r1, r8, r10, r11}
 ; CHECK-NEXT:    vldrw.u32 q5, [r4, #-16]
-; CHECK-NEXT:    vfma.f32 q0, q1, r3
 ; CHECK-NEXT:    vldrw.u32 q2, [r4, #-12]
+; CHECK-NEXT:    vfma.f32 q0, q1, r3
+; CHECK-NEXT:    ldrd r9, r1, [r7, #24]
 ; CHECK-NEXT:    vfma.f32 q0, q6, r5
 ; CHECK-NEXT:    vldrw.u32 q3, [r4, #-8]
 ; CHECK-NEXT:    vfma.f32 q0, q4, r6
 ; CHECK-NEXT:    vldrw.u32 q1, [r4, #-4]
-; CHECK-NEXT:    vfma.f32 q0, q5, r1
+; CHECK-NEXT:    vfma.f32 q0, q5, r8
 ; CHECK-NEXT:    adds r7, #32
-; CHECK-NEXT:    vfma.f32 q0, q2, r8
-; CHECK-NEXT:    vfma.f32 q0, q3, r10
-; CHECK-NEXT:    vfma.f32 q0, q1, r11
+; CHECK-NEXT:    vfma.f32 q0, q2, r11
+; CHECK-NEXT:    vfma.f32 q0, q3, r9
+; CHECK-NEXT:    vfma.f32 q0, q1, r1
 ; CHECK-NEXT:    le lr, .LBB16_8
 ; CHECK-NEXT:    b .LBB16_4
 ; CHECK-NEXT:  .LBB16_9: @ in Loop: Header=BB16_6 Depth=1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
index 8fe310bd3d5e3..652d25af02e7c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
@@ -180,15 +180,15 @@ define void @correlate(ptr nocapture noundef readonly %ID, ptr nocapture noundef
 ; CHECK-NEXT:    @ in Loop: Header=BB4_4 Depth=1
 ; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    add.w r2, r9, r10
-; CHECK-NEXT:    add.w r5, r1, r9, lsl #1
+; CHECK-NEXT:    add.w r7, r1, r9, lsl #1
 ; CHECK-NEXT:    add.w r2, r1, r2, lsl #1
-; CHECK-NEXT:    sub.w r7, r8, r9
-; CHECK-NEXT:    dlstp.32 lr, r7
+; CHECK-NEXT:    sub.w r5, r8, r9
+; CHECK-NEXT:    dlstp.32 lr, r5
 ; CHECK-NEXT:  .LBB4_11: @ %vec.epilog.vector.body
 ; CHECK-NEXT:    @ Parent Loop BB4_4 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    rsbs r4, r3, #0
-; CHECK-NEXT:    vldrh.s32 q0, [r5], #8
+; CHECK-NEXT:    vldrh.s32 q0, [r7], #8
 ; CHECK-NEXT:    vldrh.s32 q1, [r2], #8
 ; CHECK-NEXT:    vmul.i32 q0, q1, q0
 ; CHECK-NEXT:    vshl.s32 q0, r4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll b/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll
index 22deb23cad27e..da59cb259db61 100644
--- a/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll
@@ -548,43 +548,44 @@ define i32 @reverted(i1 zeroext %b) {
 ; CHECK-NEXT:    letp lr, .LBB19_1
 ; CHECK-NEXT:  .LBB19_2: @ %entry
 ; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    movw r11, :lower16:arr_20
-; CHECK-NEXT:    adr r6, .LCPI19_0
-; CHECK-NEXT:    movt r11, :upper16:arr_20
+; CHECK-NEXT:    movw r6, :lower16:arr_20
+; CHECK-NEXT:    movt r6, :upper16:arr_20
+; CHECK-NEXT:    add.w r3, r6, #80
 ; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:    movw r0, :lower16:arr_21
 ; CHECK-NEXT:    movt r0, :upper16:arr_21
 ; CHECK-NEXT:    add.w r5, r0, #36
-; CHECK-NEXT:    add.w r3, r11, #80
-; CHECK-NEXT:    add.w r9, r11, #128
-; CHECK-NEXT:    add.w r7, r11, #112
-; CHECK-NEXT:    add.w r2, r11, #96
-; CHECK-NEXT:    add.w r4, r11, #64
-; CHECK-NEXT:    add.w r0, r11, #48
-; CHECK-NEXT:    add.w r1, r11, #32
-; CHECK-NEXT:    add.w r12, r11, #16
-; CHECK-NEXT:    mov.w r8, #327685
+; CHECK-NEXT:    add.w r11, r6, #128
+; CHECK-NEXT:    add.w r7, r6, #112
+; CHECK-NEXT:    add.w r2, r6, #96
+; CHECK-NEXT:    add.w r4, r6, #64
+; CHECK-NEXT:    add.w r0, r6, #48
+; CHECK-NEXT:    add.w r1, r6, #32
+; CHECK-NEXT:    add.w r12, r6, #16
+; CHECK-NEXT:    adr r6, .LCPI19_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r6]
-; CHECK-NEXT:    mov r6, r8
-; CHECK-NEXT:    mov.w r10, #5
+; CHECK-NEXT:    movw r6, :lower16:arr_20
+; CHECK-NEXT:    mov.w r8, #327685
+; CHECK-NEXT:    mov.w r9, #5
 ; CHECK-NEXT:    vmov.i16 q1, #0x5
-; CHECK-NEXT:    mov.w r8, #0
+; CHECK-NEXT:    mov.w r10, #0
+; CHECK-NEXT:    movt r6, :upper16:arr_20
 ; CHECK-NEXT:  .LBB19_3: @ %for.cond8.preheader
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    str r6, [r5, #-4]
+; CHECK-NEXT:    str r8, [r5, #-4]
 ; CHECK-NEXT:    vstrh.16 q1, [r5, #-36]
-; CHECK-NEXT:    strh.w r10, [r5]
+; CHECK-NEXT:    strh.w r9, [r5]
 ; CHECK-NEXT:    vstrh.16 q1, [r5, #-20]
 ; CHECK-NEXT:    vstrw.32 q0, [r3]
 ; CHECK-NEXT:    vstrh.16 q0, [r12], #152
-; CHECK-NEXT:    vstrh.16 q0, [r11], #152
+; CHECK-NEXT:    vstrh.16 q0, [r6], #152
 ; CHECK-NEXT:    vstrh.16 q0, [r1], #152
 ; CHECK-NEXT:    vstrh.16 q0, [r0], #152
 ; CHECK-NEXT:    vstrh.16 q0, [r4], #152
 ; CHECK-NEXT:    vstrh.16 q0, [r2], #152
 ; CHECK-NEXT:    vstrh.16 q0, [r7], #152
-; CHECK-NEXT:    vstrh.16 q0, [r9], #152
-; CHECK-NEXT:    strd r10, r8, [r3, #64]
+; CHECK-NEXT:    vstrh.16 q0, [r11], #152
+; CHECK-NEXT:    strd r9, r10, [r3, #64]
 ; CHECK-NEXT:    adds r5, #38
 ; CHECK-NEXT:    adds r3, #152
 ; CHECK-NEXT:    le lr, .LBB19_3
@@ -600,46 +601,46 @@ define i32 @reverted(i1 zeroext %b) {
 ; CHECK-NEXT:    vstrb.8 q1, [r0], #16
 ; CHECK-NEXT:    letp lr, .LBB19_5
 ; CHECK-NEXT:  .LBB19_6: @ %for.cond.cleanup6
-; CHECK-NEXT:    movw r2, :lower16:arr_20
+; CHECK-NEXT:    movw r6, :lower16:arr_20
 ; CHECK-NEXT:    movw r0, #7376
-; CHECK-NEXT:    movt r2, :upper16:arr_20
-; CHECK-NEXT:    adds r3, r2, r0
+; CHECK-NEXT:    movt r6, :upper16:arr_20
+; CHECK-NEXT:    adds r3, r6, r0
 ; CHECK-NEXT:    movw r0, #7408
-; CHECK-NEXT:    add.w r12, r2, r0
+; CHECK-NEXT:    add.w r12, r6, r0
 ; CHECK-NEXT:    movw r0, #7344
-; CHECK-NEXT:    add.w r11, r2, r0
+; CHECK-NEXT:    add.w r9, r6, r0
 ; CHECK-NEXT:    movw r0, #7312
-; CHECK-NEXT:    add.w r9, r2, r0
+; CHECK-NEXT:    adds r2, r6, r0
 ; CHECK-NEXT:    movw r0, :lower16:arr_21
-; CHECK-NEXT:    add.w r1, r2, #7424
-; CHECK-NEXT:    add.w r7, r2, #7392
-; CHECK-NEXT:    add.w r4, r2, #7360
-; CHECK-NEXT:    add.w r5, r2, #7328
-; CHECK-NEXT:    add.w r6, r2, #7296
-; CHECK-NEXT:    ldr r2, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    add.w r1, r6, #7424
+; CHECK-NEXT:    add.w r7, r6, #7392
+; CHECK-NEXT:    add.w r4, r6, #7360
+; CHECK-NEXT:    add.w r5, r6, #7328
+; CHECK-NEXT:    add.w r8, r6, #7296
+; CHECK-NEXT:    ldr r6, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    movt r0, :upper16:arr_21
 ; CHECK-NEXT:    addw r0, r0, #1860
 ; CHECK-NEXT:    mov.w r10, #5
-; CHECK-NEXT:    dls lr, r2
-; CHECK-NEXT:    mov.w r2, #327685
+; CHECK-NEXT:    dls lr, r6
+; CHECK-NEXT:    mov.w r6, #327685
 ; CHECK-NEXT:    vmov.i16 q1, #0x5
-; CHECK-NEXT:    mov.w r8, #0
+; CHECK-NEXT:    mov.w r11, #0
 ; CHECK-NEXT:  .LBB19_7: @ %for.cond8.preheader.1
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    str r2, [r0, #-4]
+; CHECK-NEXT:    str r6, [r0, #-4]
 ; CHECK-NEXT:    vstrh.16 q1, [r0, #-36]
 ; CHECK-NEXT:    strh.w r10, [r0]
 ; CHECK-NEXT:    vstrh.16 q1, [r0, #-20]
 ; CHECK-NEXT:    vstrw.32 q0, [r3]
-; CHECK-NEXT:    vstrh.16 q0, [r9], #152
-; CHECK-NEXT:    vstrh.16 q0, [r6], #152
+; CHECK-NEXT:    vstrh.16 q0, [r2], #152
+; CHECK-NEXT:    vstrh.16 q0, [r8], #152
 ; CHECK-NEXT:    vstrh.16 q0, [r5], #152
-; CHECK-NEXT:    vstrh.16 q0, [r11], #152
+; CHECK-NEXT:    vstrh.16 q0, [r9], #152
 ; CHECK-NEXT:    vstrh.16 q0, [r4], #152
 ; CHECK-NEXT:    vstrh.16 q0, [r7], #152
 ; CHECK-NEXT:    vstrh.16 q0, [r12], #152
 ; CHECK-NEXT:    vstrh.16 q0, [r1], #152
-; CHECK-NEXT:    strd r10, r8, [r3, #64]
+; CHECK-NEXT:    strd r10, r11, [r3, #64]
 ; CHECK-NEXT:    adds r0, #38
 ; CHECK-NEXT:    adds r3, #152
 ; CHECK-NEXT:    le lr, .LBB19_7
@@ -662,7 +663,7 @@ define i32 @reverted(i1 zeroext %b) {
 ; CHECK-NEXT:    movw r0, #14704
 ; CHECK-NEXT:    add.w r12, r7, r0
 ; CHECK-NEXT:    movw r0, #14688
-; CHECK-NEXT:    add.w r11, r7, r0
+; CHECK-NEXT:    add.w r8, r7, r0
 ; CHECK-NEXT:    movw r0, #14640
 ; CHECK-NEXT:    add.w r9, r7, r0
 ; CHECK-NEXT:    movw r0, #14624
@@ -680,7 +681,7 @@ define i32 @reverted(i1 zeroext %b) {
 ; CHECK-NEXT:    dls lr, r7
 ; CHECK-NEXT:    mov.w r7, #327685
 ; CHECK-NEXT:    vmov.i16 q1, #0x5
-; CHECK-NEXT:    mov.w r8, #0
+; CHECK-NEXT:    mov.w r11, #0
 ; CHECK-NEXT:  .LBB19_11: @ %for.cond8.preheader.2
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    str r7, [r1, #-4]
@@ -693,10 +694,10 @@ define i32 @reverted(i1 zeroext %b) {
 ; CHECK-NEXT:    vstrh.16 q0, [r2], #152
 ; CHECK-NEXT:    vstrh.16 q0, [r9], #152
 ; CHECK-NEXT:    vstrh.16 q0, [r5], #152
-; CHECK-NEXT:    vstrh.16 q0, [r11], #152
+; CHECK-NEXT:    vstrh.16 q0, [r8], #152
 ; CHECK-NEXT:    vstrh.16 q0, [r12], #152
 ; CHECK-NEXT:    vstrh.16 q0, [r4], #152
-; CHECK-NEXT:    strd r10, r8, [r3, #64]
+; CHECK-NEXT:    strd r10, r11, [r3, #64]
 ; CHECK-NEXT:    adds r1, #38
 ; CHECK-NEXT:    adds r3, #152
 ; CHECK-NEXT:    le lr, .LBB19_11
@@ -720,9 +721,9 @@ define i32 @reverted(i1 zeroext %b) {
 ; CHECK-NEXT:    movt r7, :upper16:arr_20
 ; CHECK-NEXT:    add.w r12, r7, r1
 ; CHECK-NEXT:    movw r1, #21984
-; CHECK-NEXT:    add.w r10, r7, r1
-; CHECK-NEXT:    movw r1, #21952
 ; CHECK-NEXT:    add.w r8, r7, r1
+; CHECK-NEXT:    movw r1, #21952
+; CHECK-NEXT:    add.w r9, r7, r1
 ; CHECK-NEXT:    movw r1, #21936
 ; CHECK-NEXT:    movw r0, #21968
 ; CHECK-NEXT:    adds r5, r7, r1
@@ -734,7 +735,7 @@ define i32 @reverted(i1 zeroext %b) {
 ; CHECK-NEXT:    add.w r3, r7, #22016
 ; CHECK-NEXT:    add.w r6, r7, #21888
 ; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    mov.w r9, #5
+; CHECK-NEXT:    mov.w r10, #5
 ; CHECK-NEXT:    vmov.i16 q1, #0x5
 ; CHECK-NEXT:    mov.w r11, #0
 ; CHECK-NEXT:    dls lr, r7
@@ -743,18 +744,18 @@ define i32 @reverted(i1 zeroext %b) {
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    str r7, [r2, #-4]
 ; CHECK-NEXT:    vstrh.16 q1, [r2, #-36]
-; CHECK-NEXT:    strh.w r9, [r2]
+; CHECK-NEXT:    strh.w r10, [r2]
 ; CHECK-NEXT:    vstrh.16 q1, [r2, #-20]
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
 ; CHECK-NEXT:    vstrh.16 q0, [r4], #152
 ; CHECK-NEXT:    vstrh.16 q0, [r6], #152
 ; CHECK-NEXT:    vstrh.16 q0, [r1], #152
 ; CHECK-NEXT:    vstrh.16 q0, [r5], #152
+; CHECK-NEXT:    vstrh.16 q0, [r9], #152
 ; CHECK-NEXT:    vstrh.16 q0, [r8], #152
-; CHECK-NEXT:    vstrh.16 q0, [r10], #152
 ; CHECK-NEXT:    vstrh.16 q0, [r12], #152
 ; CHECK-NEXT:    vstrh.16 q0, [r3], #152
-; CHECK-NEXT:    strd r9, r11, [r0, #64]
+; CHECK-NEXT:    strd r10, r11, [r0, #64]
 ; CHECK-NEXT:    adds r2, #38
 ; CHECK-NEXT:    adds r0, #152
 ; CHECK-NEXT:    le lr, .LBB19_15

diff  --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
index 14ea3a3713224..dad856c0677a1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
@@ -14,8 +14,8 @@ define arm_aapcs_vfpcc void @k() {
 ; CHECK-NEXT:    sub sp, #32
 ; CHECK-NEXT:    adr r5, .LCPI0_0
 ; CHECK-NEXT:    adr r4, .LCPI0_1
-; CHECK-NEXT:    vldrw.u32 q5, [r5]
-; CHECK-NEXT:    vldrw.u32 q6, [r4]
+; CHECK-NEXT:    vldrw.u32 q6, [r5]
+; CHECK-NEXT:    vldrw.u32 q5, [r4]
 ; CHECK-NEXT:    add r0, sp, #16
 ; CHECK-NEXT:    vmov.i32 q0, #0x1
 ; CHECK-NEXT:    vmov.i8 q1, #0x0
@@ -25,14 +25,14 @@ define arm_aapcs_vfpcc void @k() {
 ; CHECK-NEXT:    movs r1, #0
 ; CHECK-NEXT:  .LBB0_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vand q6, q6, q0
 ; CHECK-NEXT:    vand q5, q5, q0
-; CHECK-NEXT:    vcmp.i32 eq, q6, zr
-; CHECK-NEXT:    vpsel q6, q2, q1
+; CHECK-NEXT:    vand q6, q6, q0
 ; CHECK-NEXT:    vcmp.i32 eq, q5, zr
 ; CHECK-NEXT:    vpsel q5, q2, q1
-; CHECK-NEXT:    vstrh.32 q6, [r0]
-; CHECK-NEXT:    vstrh.32 q5, [r0, #8]
+; CHECK-NEXT:    vcmp.i32 eq, q6, zr
+; CHECK-NEXT:    vpsel q6, q2, q1
+; CHECK-NEXT:    vstrh.32 q5, [r0]
+; CHECK-NEXT:    vstrh.32 q6, [r0, #8]
 ; CHECK-NEXT:    vldrw.u32 q5, [r0]
 ; CHECK-NEXT:    vcmp.i16 ne, q5, zr
 ; CHECK-NEXT:    vmov.i32 q5, #0x0

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll
index ff5a27149cb2e..43ed5eefbf4c7 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll
@@ -17,16 +17,16 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32
 ; CHECK-NEXT:    mov.w r5, #0
 ; CHECK-NEXT:    csel r7, r6, r5, hs
 ; CHECK-NEXT:    add.w lr, r7, #1
-; CHECK-NEXT:    mov r8, r5
+; CHECK-NEXT:    mov r4, r5
 ; CHECK-NEXT:    vldrh.u16 q0, [r0], #32
 ; CHECK-NEXT:    movs r7, #0
-; CHECK-NEXT:    mov r6, r5
+; CHECK-NEXT:    mov r8, r5
 ; CHECK-NEXT:    vldrh.u16 q1, [r1], #32
-; CHECK-NEXT:    vmlsldava.s16 r8, r7, q0, q1
+; CHECK-NEXT:    vmlsldava.s16 r4, r7, q0, q1
 ; CHECK-NEXT:    vldrh.u16 q2, [r0, #-16]
-; CHECK-NEXT:    vmlaldavax.s16 r6, r5, q0, q1
+; CHECK-NEXT:    vmlaldavax.s16 r8, r5, q0, q1
 ; CHECK-NEXT:    vldrh.u16 q3, [r1, #-16]
-; CHECK-NEXT:    vmlsldava.s16 r8, r7, q2, q3
+; CHECK-NEXT:    vmlsldava.s16 r4, r7, q2, q3
 ; CHECK-NEXT:    vldrh.u16 q0, [r1], #32
 ; CHECK-NEXT:    sub.w lr, lr, #1
 ; CHECK-NEXT:    cmp.w lr, #0
@@ -35,72 +35,75 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  .LBB0_2: @ %while.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmlaldavax.s16 r6, r5, q2, q3
+; CHECK-NEXT:    vmlaldavax.s16 r8, r5, q2, q3
 ; CHECK-NEXT:    vldrh.u16 q3, [r1, #-16]
-; CHECK-NEXT:    vmlsldava.s16 r8, r7, q1, q0
+; CHECK-NEXT:    vmlsldava.s16 r4, r7, q1, q0
 ; CHECK-NEXT:    vldrh.u16 q2, [r0, #-16]
-; CHECK-NEXT:    vmlaldavax.s16 r6, r5, q1, q0
+; CHECK-NEXT:    vmlaldavax.s16 r8, r5, q1, q0
 ; CHECK-NEXT:    vldrh.u16 q1, [r0], #32
-; CHECK-NEXT:    vmlsldava.s16 r8, r7, q2, q3
+; CHECK-NEXT:    vmlsldava.s16 r4, r7, q2, q3
 ; CHECK-NEXT:    vldrh.u16 q0, [r1], #32
 ; CHECK-NEXT:    le lr, .LBB0_2
 ; CHECK-NEXT:  .LBB0_3:
-; CHECK-NEXT:    vmlaldavax.s16 r6, r5, q2, q3
-; CHECK-NEXT:    movs r4, #14
-; CHECK-NEXT:    and.w r2, r4, r2, lsl #1
-; CHECK-NEXT:    vmlaldavax.s16 r6, r5, q1, q0
+; CHECK-NEXT:    vmlaldavax.s16 r8, r5, q2, q3
+; CHECK-NEXT:    movs r6, #14
+; CHECK-NEXT:    and.w r2, r6, r2, lsl #1
+; CHECK-NEXT:    vmlaldavax.s16 r8, r5, q1, q0
 ; CHECK-NEXT:    vldrh.u16 q2, [r0, #-16]
-; CHECK-NEXT:    vmlsldava.s16 r8, r7, q1, q0
+; CHECK-NEXT:    vmlsldava.s16 r4, r7, q1, q0
 ; CHECK-NEXT:    vldrh.u16 q0, [r1, #-16]
-; CHECK-NEXT:    vmlaldavax.s16 r6, r5, q2, q0
+; CHECK-NEXT:    vmlaldavax.s16 r8, r5, q2, q0
 ; CHECK-NEXT:    vctp.16 r2
-; CHECK-NEXT:    vmlsldava.s16 r8, r7, q2, q0
+; CHECK-NEXT:    vmlsldava.s16 r4, r7, q2, q0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vldrht.u16 q1, [r0]
 ; CHECK-NEXT:    cmp r2, #9
 ; CHECK-NEXT:    vpsttt
 ; CHECK-NEXT:    vldrht.u16 q0, [r1]
-; CHECK-NEXT:    vmlsldavat.s16 r8, r7, q1, q0
-; CHECK-NEXT:    vmlaldavaxt.s16 r6, r5, q1, q0
-; CHECK-NEXT:    blo .LBB0_9
+; CHECK-NEXT:    vmlsldavat.s16 r4, r7, q1, q0
+; CHECK-NEXT:    vmlaldavaxt.s16 r8, r5, q1, q0
+; CHECK-NEXT:    blo .LBB0_10
 ; CHECK-NEXT:  @ %bb.4: @ %do.body.1
 ; CHECK-NEXT:    subs r2, #8
 ; CHECK-NEXT:    vctp.16 r2
 ; CHECK-NEXT:    vpstttt
 ; CHECK-NEXT:    vldrht.u16 q0, [r0, #16]
 ; CHECK-NEXT:    vldrht.u16 q1, [r1, #16]
-; CHECK-NEXT:    vmlsldavat.s16 r8, r7, q0, q1
-; CHECK-NEXT:    vmlaldavaxt.s16 r6, r5, q0, q1
-; CHECK-NEXT:    b .LBB0_9
+; CHECK-NEXT:    vmlsldavat.s16 r4, r7, q0, q1
+; CHECK-NEXT:    vmlaldavaxt.s16 r8, r5, q0, q1
+; CHECK-NEXT:    b .LBB0_10
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  .LBB0_5: @ %if.else
-; CHECK-NEXT:    mov.w r8, #0
-; CHECK-NEXT:    cbz r2, .LBB0_8
+; CHECK-NEXT:    mov.w r4, #0
+; CHECK-NEXT:    cbz r2, .LBB0_9
 ; CHECK-NEXT:  @ %bb.6: @ %while.body14.preheader
-; CHECK-NEXT:    lsls r4, r2, #1
-; CHECK-NEXT:    mov r5, r8
-; CHECK-NEXT:    movs r6, #0
-; CHECK-NEXT:    mov r7, r8
-; CHECK-NEXT:    dlstp.16 lr, r4
+; CHECK-NEXT:    lsls r6, r2, #1
+; CHECK-NEXT:    mov r5, r4
+; CHECK-NEXT:    mov r7, r4
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    dlstp.16 lr, r6
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  .LBB0_7: @ %while.body14
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
 ; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
-; CHECK-NEXT:    vmlsldava.s16 r8, r7, q0, q1
-; CHECK-NEXT:    vmlaldavax.s16 r6, r5, q0, q1
+; CHECK-NEXT:    vmlsldava.s16 r2, r7, q0, q1
+; CHECK-NEXT:    vmlaldavax.s16 r4, r5, q0, q1
 ; CHECK-NEXT:    letp lr, .LBB0_7
-; CHECK-NEXT:    b .LBB0_9
+; CHECK-NEXT:  @ %bb.8: @ %if.end.loopexit177
+; CHECK-NEXT:    mov r8, r4
+; CHECK-NEXT:    mov r4, r2
+; CHECK-NEXT:    b .LBB0_10
 ; CHECK-NEXT:    .p2align 2
-; CHECK-NEXT:  .LBB0_8:
-; CHECK-NEXT:    mov r7, r8
-; CHECK-NEXT:    movs r6, #0
-; CHECK-NEXT:    mov r5, r8
-; CHECK-NEXT:  .LBB0_9: @ %if.end
-; CHECK-NEXT:    asrl r8, r7, #6
-; CHECK-NEXT:    asrl r6, r5, #6
-; CHECK-NEXT:    str.w r8, [r3]
-; CHECK-NEXT:    str.w r6, [r12]
+; CHECK-NEXT:  .LBB0_9:
+; CHECK-NEXT:    mov r7, r4
+; CHECK-NEXT:    mov.w r8, #0
+; CHECK-NEXT:    mov r5, r4
+; CHECK-NEXT:  .LBB0_10: @ %if.end
+; CHECK-NEXT:    asrl r4, r7, #6
+; CHECK-NEXT:    asrl r8, r5, #6
+; CHECK-NEXT:    str r4, [r3]
+; CHECK-NEXT:    str.w r8, [r12]
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
 entry:
   %cmp = icmp ugt i32 %numSamples, 15

diff  --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
index c987c4b537200..f90af3cc5ba24 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
@@ -845,12 +845,12 @@ define void @DCT_mve6(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:  .LBB5_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB5_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    add.w r9, r3, r5
+; CHECK-NEXT:    add.w r12, r3, r5
 ; CHECK-NEXT:    vldrw.u32 q6, [r1], #16
 ; CHECK-NEXT:    vldrw.u32 q7, [r3], #16
-; CHECK-NEXT:    add.w r10, r9, r5
+; CHECK-NEXT:    add.w r10, r12, r5
 ; CHECK-NEXT:    vfma.f32 q4, q7, q6
-; CHECK-NEXT:    vldrw.u32 q7, [r9]
+; CHECK-NEXT:    vldrw.u32 q7, [r12]
 ; CHECK-NEXT:    add.w r6, r10, r5
 ; CHECK-NEXT:    vfma.f32 q5, q7, q6
 ; CHECK-NEXT:    vldrw.u32 q7, [r10]
@@ -1093,16 +1093,18 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    vfmat.f32 q5, q0, q7
 ; CHECK-NEXT:    vldrwt.u32 q0, [r10]
 ; CHECK-NEXT:    add.w r6, r11, r5
-; CHECK-NEXT:    vstrw.32 q5, [sp, #40] @ 16-byte Spill
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q6, q0, q7
 ; CHECK-NEXT:    vldrwt.u32 q0, [r11]
+; CHECK-NEXT:    vstrw.32 q6, [sp, #40] @ 16-byte Spill
+; CHECK-NEXT:    vmov q6, q5
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vfmat.f32 q1, q0, q7
 ; CHECK-NEXT:    vmov q5, q4
 ; CHECK-NEXT:    vmov q4, q3
-; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vfmat.f32 q1, q0, q7
-; CHECK-NEXT:    vldrwt.u32 q0, [r6]
 ; CHECK-NEXT:    vmov q3, q1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q0, [r6]
 ; CHECK-NEXT:    vldrw.u32 q1, [sp, #56] @ 16-byte Reload
 ; CHECK-NEXT:    adds r7, r6, r5
 ; CHECK-NEXT:    vpstt
@@ -1120,7 +1122,8 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q4, q0, q7
 ; CHECK-NEXT:    vldrwt.u32 q0, [r7]
-; CHECK-NEXT:    vldrw.u32 q5, [sp, #40] @ 16-byte Reload
+; CHECK-NEXT:    vmov q5, q6
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #40] @ 16-byte Reload
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vfmat.f32 q2, q0, q7
 ; CHECK-NEXT:    le lr, .LBB6_3

diff  --git a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll
index bd0e5dabea3cf..096d4382d2c35 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll
@@ -8,24 +8,23 @@ define i32 @vaddv(ptr nocapture readonly %data, i32 %N) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    mov lr, r1
 ; CHECK-NEXT:    cmp r1, #1
 ; CHECK-NEXT:    blt .LBB0_4
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT:    dls lr, r1
-; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    mov r1, r0
+; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:  .LBB0_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q0, [r0], #32
-; CHECK-NEXT:    vaddva.s32 r2, q0
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #-16]
-; CHECK-NEXT:    vaddva.s32 r2, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r1], #32
+; CHECK-NEXT:    vaddva.s32 r0, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r1, #-16]
+; CHECK-NEXT:    vaddva.s32 r0, q0
 ; CHECK-NEXT:    le lr, .LBB0_2
 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
-; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB0_4:
-; CHECK-NEXT:    movs r2, #0
-; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %cmp11 = icmp sgt i32 %N, 0

diff  --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
index 3e7ed0b096b82..cba0f9cbba2ca 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
@@ -212,9 +212,9 @@ define ptr @test(ptr nocapture readonly %input_row, ptr nocapture readonly %inpu
 ; CHECK-NEXT:    beq .LBB2_8
 ; CHECK-NEXT:  @ %bb.2: @ %for.body.lr.ph
 ; CHECK-NEXT:    ldr r3, [sp, #64]
-; CHECK-NEXT:    mov.w r11, #0
+; CHECK-NEXT:    mov.w r9, #0
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    ldr.w r9, [sp, #56]
+; CHECK-NEXT:    ldr.w r11, [sp, #56]
 ; CHECK-NEXT:    add.w r0, r1, r3, lsl #1
 ; CHECK-NEXT:    str r0, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    adds r0, r1, r3
@@ -235,15 +235,15 @@ define ptr @test(ptr nocapture readonly %input_row, ptr nocapture readonly %inpu
 ; CHECK-NEXT:    add.w r1, r8, r10
 ; CHECK-NEXT:    add r1, r6
 ; CHECK-NEXT:    add r1, r12
-; CHECK-NEXT:    strb.w r1, [r3, r11]
-; CHECK-NEXT:    add.w r11, r11, #1
-; CHECK-NEXT:    cmp r11, r2
+; CHECK-NEXT:    strb.w r1, [r3, r9]
+; CHECK-NEXT:    add.w r9, r9, #1
+; CHECK-NEXT:    cmp r9, r2
 ; CHECK-NEXT:    beq .LBB2_8
 ; CHECK-NEXT:  .LBB2_5: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB2_7 Depth 2
 ; CHECK-NEXT:    ldr r1, [sp, #68]
-; CHECK-NEXT:    ldr.w r12, [r1, r11, lsl #2]
+; CHECK-NEXT:    ldr.w r12, [r1, r9, lsl #2]
 ; CHECK-NEXT:    subs r1, r0, r0
 ; CHECK-NEXT:    ble .LBB2_3
 ; CHECK-NEXT:  @ %bb.6: @ %for.body24.preheader
@@ -254,7 +254,7 @@ define ptr @test(ptr nocapture readonly %input_row, ptr nocapture readonly %inpu
 ; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    mov r8, r12
-; CHECK-NEXT:    mla r7, r11, r7, r3
+; CHECK-NEXT:    mla r7, r9, r7, r3
 ; CHECK-NEXT:    ldr r5, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    ldrd r4, r3, [sp] @ 8-byte Folded Reload
 ; CHECK-NEXT:    mov r10, r12
@@ -262,17 +262,17 @@ define ptr @test(ptr nocapture readonly %input_row, ptr nocapture readonly %inpu
 ; CHECK-NEXT:    @ Parent Loop BB2_5 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vldrb.s16 q0, [r4], #8
-; CHECK-NEXT:    vadd.i16 q1, q0, r9
+; CHECK-NEXT:    vadd.i16 q1, q0, r11
 ; CHECK-NEXT:    vldrb.s16 q0, [r7], #8
 ; CHECK-NEXT:    vmlava.s16 r12, q0, q1
 ; CHECK-NEXT:    vldrb.s16 q1, [r5], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r9
+; CHECK-NEXT:    vadd.i16 q1, q1, r11
 ; CHECK-NEXT:    vmlava.s16 r6, q0, q1
 ; CHECK-NEXT:    vldrb.s16 q1, [r3], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r9
+; CHECK-NEXT:    vadd.i16 q1, q1, r11
 ; CHECK-NEXT:    vmlava.s16 r8, q0, q1
 ; CHECK-NEXT:    vldrb.s16 q1, [r1], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r9
+; CHECK-NEXT:    vadd.i16 q1, q1, r11
 ; CHECK-NEXT:    vmlava.s16 r10, q0, q1
 ; CHECK-NEXT:    le lr, .LBB2_7
 ; CHECK-NEXT:    b .LBB2_4
@@ -395,9 +395,9 @@ define ptr @test_optsize(ptr nocapture readonly %input_row, ptr nocapture readon
 ; CHECK-NEXT:    beq .LBB3_8
 ; CHECK-NEXT:  @ %bb.2: @ %for.body.lr.ph
 ; CHECK-NEXT:    ldr r3, [sp, #64]
-; CHECK-NEXT:    mov.w r11, #0
+; CHECK-NEXT:    mov.w r9, #0
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    ldr.w r9, [sp, #56]
+; CHECK-NEXT:    ldr.w r11, [sp, #56]
 ; CHECK-NEXT:    add.w r0, r1, r3, lsl #1
 ; CHECK-NEXT:    str r0, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    adds r0, r1, r3
@@ -411,7 +411,7 @@ define ptr @test_optsize(ptr nocapture readonly %input_row, ptr nocapture readon
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB3_5 Depth 2
 ; CHECK-NEXT:    ldr r1, [sp, #68]
-; CHECK-NEXT:    ldr.w r12, [r1, r11, lsl #2]
+; CHECK-NEXT:    ldr.w r12, [r1, r9, lsl #2]
 ; CHECK-NEXT:    subs r1, r0, r0
 ; CHECK-NEXT:    ble .LBB3_6
 ; CHECK-NEXT:  @ %bb.4: @ %for.body24.preheader
@@ -422,7 +422,7 @@ define ptr @test_optsize(ptr nocapture readonly %input_row, ptr nocapture readon
 ; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    mov r8, r12
-; CHECK-NEXT:    mla r7, r11, r7, r3
+; CHECK-NEXT:    mla r7, r9, r7, r3
 ; CHECK-NEXT:    ldr r5, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    ldrd r4, r3, [sp] @ 8-byte Folded Reload
 ; CHECK-NEXT:    mov r10, r12
@@ -430,17 +430,17 @@ define ptr @test_optsize(ptr nocapture readonly %input_row, ptr nocapture readon
 ; CHECK-NEXT:    @ Parent Loop BB3_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vldrb.s16 q0, [r4], #8
-; CHECK-NEXT:    vadd.i16 q1, q0, r9
+; CHECK-NEXT:    vadd.i16 q1, q0, r11
 ; CHECK-NEXT:    vldrb.s16 q0, [r7], #8
 ; CHECK-NEXT:    vmlava.s16 r12, q0, q1
 ; CHECK-NEXT:    vldrb.s16 q1, [r5], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r9
+; CHECK-NEXT:    vadd.i16 q1, q1, r11
 ; CHECK-NEXT:    vmlava.s16 r6, q0, q1
 ; CHECK-NEXT:    vldrb.s16 q1, [r3], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r9
+; CHECK-NEXT:    vadd.i16 q1, q1, r11
 ; CHECK-NEXT:    vmlava.s16 r8, q0, q1
 ; CHECK-NEXT:    vldrb.s16 q1, [r1], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r9
+; CHECK-NEXT:    vadd.i16 q1, q1, r11
 ; CHECK-NEXT:    vmlava.s16 r10, q0, q1
 ; CHECK-NEXT:    le lr, .LBB3_5
 ; CHECK-NEXT:    b .LBB3_7
@@ -454,9 +454,9 @@ define ptr @test_optsize(ptr nocapture readonly %input_row, ptr nocapture readon
 ; CHECK-NEXT:    add.w r1, r8, r10
 ; CHECK-NEXT:    add r1, r6
 ; CHECK-NEXT:    add r1, r12
-; CHECK-NEXT:    strb.w r1, [r3, r11]
-; CHECK-NEXT:    add.w r11, r11, #1
-; CHECK-NEXT:    cmp r11, r2
+; CHECK-NEXT:    strb.w r1, [r3, r9]
+; CHECK-NEXT:    add.w r9, r9, #1
+; CHECK-NEXT:    cmp r9, r2
 ; CHECK-NEXT:    bne .LBB3_3
 ; CHECK-NEXT:  .LBB3_8: @ %if.end
 ; CHECK-NEXT:    ldr r0, [sp, #72]

diff  --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
index 16e7736886e94..29b56639bd769 100644
--- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
@@ -12,13 +12,12 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no
 ; CHECK-NEXT:    beq.w .LBB0_8
 ; CHECK-NEXT:  @ %bb.1: @ %entry
 ; CHECK-NEXT:    mov r11, r2
-; CHECK-NEXT:    mov r8, r1
 ; CHECK-NEXT:    cmp r3, #1
 ; CHECK-NEXT:    bne .LBB0_3
 ; CHECK-NEXT:  @ %bb.2:
 ; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    mov r12, r0
-; CHECK-NEXT:    mov r1, r8
+; CHECK-NEXT:    mov r8, r1
 ; CHECK-NEXT:    mov r10, r11
 ; CHECK-NEXT:    b .LBB0_6
 ; CHECK-NEXT:  .LBB0_3: @ %vector.ph
@@ -30,7 +29,7 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no
 ; CHECK-NEXT:    add.w r10, r11, r2, lsl #2
 ; CHECK-NEXT:    add.w lr, r6, r7, lsr #1
 ; CHECK-NEXT:    str r2, [sp] @ 4-byte Spill
-; CHECK-NEXT:    add.w r1, r8, r2, lsl #2
+; CHECK-NEXT:    add.w r8, r1, r2, lsl #2
 ; CHECK-NEXT:    add.w r12, r0, r2, lsl #2
 ; CHECK-NEXT:    vldrw.u32 q0, [r4]
 ; CHECK-NEXT:    vmvn.i32 q1, #0x80000000
@@ -38,7 +37,7 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrd r4, r2, [r0], #8
 ; CHECK-NEXT:    movs r5, #0
-; CHECK-NEXT:    ldrd r7, r6, [r8], #8
+; CHECK-NEXT:    ldrd r7, r6, [r1], #8
 ; CHECK-NEXT:    smull r4, r7, r7, r4
 ; CHECK-NEXT:    asrl r4, r7, #31
 ; CHECK-NEXT:    rsbs.w r9, r4, #-2147483648
@@ -81,22 +80,22 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no
 ; CHECK-NEXT:  .LBB0_6: @ %for.body.preheader
 ; CHECK-NEXT:    sub.w lr, r3, r2
 ; CHECK-NEXT:    mov.w r0, #-1
-; CHECK-NEXT:    mov.w r2, #-2147483648
+; CHECK-NEXT:    mov.w r1, #-2147483648
 ; CHECK-NEXT:    mvn r3, #-2147483648
 ; CHECK-NEXT:  .LBB0_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr r4, [r12], #4
-; CHECK-NEXT:    ldr r5, [r1], #4
-; CHECK-NEXT:    smull r4, r5, r5, r4
-; CHECK-NEXT:    asrl r4, r5, #31
-; CHECK-NEXT:    subs r6, r2, r4
-; CHECK-NEXT:    sbcs.w r6, r0, r5
-; CHECK-NEXT:    csel r4, r4, r2, lt
-; CHECK-NEXT:    csel r5, r5, r0, lt
-; CHECK-NEXT:    subs r6, r4, r3
-; CHECK-NEXT:    sbcs r5, r5, #0
-; CHECK-NEXT:    csel r4, r4, r3, lt
-; CHECK-NEXT:    str r4, [r10], #4
+; CHECK-NEXT:    ldr r2, [r12], #4
+; CHECK-NEXT:    ldr r4, [r8], #4
+; CHECK-NEXT:    smull r2, r5, r4, r2
+; CHECK-NEXT:    asrl r2, r5, #31
+; CHECK-NEXT:    subs r4, r1, r2
+; CHECK-NEXT:    sbcs.w r4, r0, r5
+; CHECK-NEXT:    csel r2, r2, r1, lt
+; CHECK-NEXT:    csel r4, r5, r0, lt
+; CHECK-NEXT:    subs r5, r2, r3
+; CHECK-NEXT:    sbcs r4, r4, #0
+; CHECK-NEXT:    csel r2, r2, r3, lt
+; CHECK-NEXT:    str r2, [r10], #4
 ; CHECK-NEXT:    le lr, .LBB0_7
 ; CHECK-NEXT:  .LBB0_8: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #8

diff  --git a/llvm/test/CodeGen/Thumb2/pr52817.ll b/llvm/test/CodeGen/Thumb2/pr52817.ll
index 4cc0960e1f57f..87615f0a1f7ef 100644
--- a/llvm/test/CodeGen/Thumb2/pr52817.ll
+++ b/llvm/test/CodeGen/Thumb2/pr52817.ll
@@ -18,25 +18,25 @@ define i32 @test(ptr %arg, ptr %arg1, ptr %arg2) #0 !dbg !6 {
 ; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    add r7, sp, #12
 ; CHECK-NEXT:    str r8, [sp, #-4]!
-; CHECK-NEXT:    mov.w r9, #0
+; CHECK-NEXT:    mov.w lr, #0
+; CHECK-NEXT:    mov.w r9, #1
+; CHECK-NEXT:    movw r12, #4100
 ; CHECK-NEXT:    movs r3, #0
-; CHECK-NEXT:    mov.w r12, #1
-; CHECK-NEXT:    movw lr, #4100
 ; CHECK-NEXT:  LBB0_1: @ %bb3
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adds r5, r3, #1
-; CHECK-NEXT:    str.w r9, [r2]
-; CHECK-NEXT:    cmp.w r9, #0
+; CHECK-NEXT:    str.w lr, [r2]
+; CHECK-NEXT:    cmp.w lr, #0
 ; CHECK-NEXT:    add.w r4, r0, r5, lsl #2
-; CHECK-NEXT:    add.w r8, r4, lr
-; CHECK-NEXT:    lsl.w r4, r12, r3
+; CHECK-NEXT:    add.w r8, r4, r12
+; CHECK-NEXT:    lsl.w r4, r9, r3
 ; CHECK-NEXT:    and.w r3, r3, r4
 ; CHECK-NEXT:    add.w r4, r1, r5, lsl #2
 ; CHECK-NEXT:    itte ne
 ; CHECK-NEXT:    movne r6, #0
 ; CHECK-NEXT:  Ltmp0:
 ; CHECK-NEXT:    @DEBUG_VALUE: test:this <- [DW_OP_LLVM_arg 0, DW_OP_plus_uconst 135168, DW_OP_LLVM_arg 1, DW_OP_constu 4, DW_OP_mul, DW_OP_plus, DW_OP_plus_uconst 4, DW_OP_stack_value] $r0, $r5
-; CHECK-NEXT:    .loc 1 28 24 prologue_end @ test.cpp:28:24 @[ test.cpp:204:23 ]
+; CHECK-NEXT:    .loc 1 28 24 prologue_end @ test.cpp:28:24
 ; CHECK-NEXT:    strne.w r6, [r8]
 ; CHECK-NEXT:    moveq r6, #1
 ; CHECK-NEXT:    ldr r4, [r4, #4]

diff  --git a/llvm/test/CodeGen/VE/Scalar/br_jt.ll b/llvm/test/CodeGen/VE/Scalar/br_jt.ll
index a418ef4892b33..fd880a7f42912 100644
--- a/llvm/test/CodeGen/VE/Scalar/br_jt.ll
+++ b/llvm/test/CodeGen/VE/Scalar/br_jt.ll
@@ -641,53 +641,53 @@ define signext i32 @br_jt8_m(i32 signext %0, i32 signext %1) {
 ; PIC:       # %bb.0:
 ; PIC-NEXT:    st %s15, 24(, %s11)
 ; PIC-NEXT:    st %s16, 32(, %s11)
-; PIC-NEXT:    and %s0, %s0, (32)0
-; PIC-NEXT:    adds.w.sx %s3, -1, %s0
-; PIC-NEXT:    cmpu.w %s2, 8, %s3
+; PIC-NEXT:    and %s2, %s0, (32)0
+; PIC-NEXT:    adds.w.sx %s0, -1, %s2
+; PIC-NEXT:    cmpu.w %s3, 8, %s0
 ; PIC-NEXT:    lea %s15, _GLOBAL_OFFSET_TABLE_ at pc_lo(-24)
 ; PIC-NEXT:    and %s15, %s15, (32)0
 ; PIC-NEXT:    sic %s16
 ; PIC-NEXT:    lea.sl %s15, _GLOBAL_OFFSET_TABLE_ at pc_hi(%s16, %s15)
-; PIC-NEXT:    brgt.w 0, %s2, .LBB7_9
+; PIC-NEXT:    brgt.w 0, %s3, .LBB7_9
 ; PIC-NEXT:  # %bb.1:
-; PIC-NEXT:    and %s2, %s1, (32)0
-; PIC-NEXT:    adds.w.zx %s1, %s3, (0)1
-; PIC-NEXT:    sll %s1, %s1, 2
+; PIC-NEXT:    and %s1, %s1, (32)0
+; PIC-NEXT:    adds.w.zx %s0, %s0, (0)1
+; PIC-NEXT:    sll %s0, %s0, 2
 ; PIC-NEXT:    lea %s3, .LJTI7_0 at gotoff_lo
 ; PIC-NEXT:    and %s3, %s3, (32)0
 ; PIC-NEXT:    lea.sl %s3, .LJTI7_0 at gotoff_hi(%s3, %s15)
-; PIC-NEXT:    ldl.sx %s1, (%s1, %s3)
+; PIC-NEXT:    ldl.sx %s0, (%s0, %s3)
 ; PIC-NEXT:    lea %s3, br_jt8_m at gotoff_lo
 ; PIC-NEXT:    and %s3, %s3, (32)0
 ; PIC-NEXT:    lea.sl %s3, br_jt8_m at gotoff_hi(%s3, %s15)
-; PIC-NEXT:    adds.l %s3, %s3, %s1
-; PIC-NEXT:    or %s1, 3, (0)1
+; PIC-NEXT:    adds.l %s3, %s3, %s0
+; PIC-NEXT:    or %s0, 3, (0)1
 ; PIC-NEXT:    b.l.t (, %s3)
 ; PIC-NEXT:  .LBB7_2:
-; PIC-NEXT:    or %s1, 0, (0)1
+; PIC-NEXT:    or %s0, 0, (0)1
 ; PIC-NEXT:    br.l.t .LBB7_10
 ; PIC-NEXT:  .LBB7_9:
-; PIC-NEXT:    or %s1, 0, %s0
+; PIC-NEXT:    or %s0, 0, %s2
 ; PIC-NEXT:    br.l.t .LBB7_10
 ; PIC-NEXT:  .LBB7_6:
-; PIC-NEXT:    adds.w.sx %s1, -2, %s2
+; PIC-NEXT:    adds.w.sx %s0, -2, %s1
 ; PIC-NEXT:    br.l.t .LBB7_10
 ; PIC-NEXT:  .LBB7_8:
-; PIC-NEXT:    or %s1, 11, (0)1
+; PIC-NEXT:    or %s0, 11, (0)1
 ; PIC-NEXT:    br.l.t .LBB7_10
 ; PIC-NEXT:  .LBB7_7:
-; PIC-NEXT:    or %s1, 10, (0)1
+; PIC-NEXT:    or %s0, 10, (0)1
 ; PIC-NEXT:    br.l.t .LBB7_10
 ; PIC-NEXT:  .LBB7_3:
-; PIC-NEXT:    or %s1, 4, (0)1
+; PIC-NEXT:    or %s0, 4, (0)1
 ; PIC-NEXT:    br.l.t .LBB7_10
 ; PIC-NEXT:  .LBB7_4:
-; PIC-NEXT:    adds.w.sx %s1, 3, %s2
+; PIC-NEXT:    adds.w.sx %s0, 3, %s1
 ; PIC-NEXT:    br.l.t .LBB7_10
 ; PIC-NEXT:  .LBB7_5:
-; PIC-NEXT:    adds.w.sx %s1, -5, %s2
+; PIC-NEXT:    adds.w.sx %s0, -5, %s1
 ; PIC-NEXT:  .LBB7_10:
-; PIC-NEXT:    adds.w.sx %s0, %s1, (0)1
+; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    ld %s16, 32(, %s11)
 ; PIC-NEXT:    ld %s15, 24(, %s11)
 ; PIC-NEXT:    b.l.t (, %s10)

diff  --git a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
index 99338d8f063f5..1962ddebc2115 100644
--- a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
+++ b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
@@ -126,14 +126,14 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
 ; CHECK-NEXT:    decl {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; CHECK-NEXT:    jmp LBB0_8
 ; CHECK-NEXT:  LBB0_18: ## %bb43
-; CHECK-NEXT:    movl %esi, %ebx
 ; CHECK-NEXT:  Ltmp5:
+; CHECK-NEXT:    movl %esi, %ebx
 ; CHECK-NEXT:    calll _OnOverFlow
 ; CHECK-NEXT:  Ltmp6:
 ; CHECK-NEXT:    jmp LBB0_3
 ; CHECK-NEXT:  LBB0_2: ## %bb29
-; CHECK-NEXT:    movl %esi, %ebx
 ; CHECK-NEXT:  Ltmp7:
+; CHECK-NEXT:    movl %esi, %ebx
 ; CHECK-NEXT:    calll _OnOverFlow
 ; CHECK-NEXT:  Ltmp8:
 ; CHECK-NEXT:  LBB0_3: ## %bb30

diff  --git a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
index 8a8e7a3b4df2c..06cf968512db8 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
@@ -297,30 +297,30 @@ define dso_local void @test6(i16 signext %0) nounwind {
 ; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    movl $buf, %edx
-; CHECK-NEXT:    movl $32, %esi
+; CHECK-NEXT:    movl $buf, %ecx
+; CHECK-NEXT:    movl $32, %edx
+; CHECK-NEXT:    xorl %esi, %esi
 ; CHECK-NEXT:    jmp .LBB5_1
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB5_3: # %if.false
 ; CHECK-NEXT:    # in Loop: Header=BB5_1 Depth=1
-; CHECK-NEXT:    decl %eax
+; CHECK-NEXT:    decl %esi
 ; CHECK-NEXT:  .LBB5_4: # %loop.bb2
 ; CHECK-NEXT:    # in Loop: Header=BB5_1 Depth=1
-; CHECK-NEXT:    leal (%rdi,%rax), %r8d
+; CHECK-NEXT:    leal (%rdi,%rsi), %r8d
 ; CHECK-NEXT:    movw %r8w, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    cmpw $7, %ax
+; CHECK-NEXT:    cmpw $7, %si
 ; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    tilezero %tmm0
-; CHECK-NEXT:    tilestored %tmm0, (%rdx,%rsi)
+; CHECK-NEXT:    tilestored %tmm0, (%rcx,%rdx)
 ; CHECK-NEXT:    jne .LBB5_5
 ; CHECK-NEXT:  .LBB5_1: # %loop.bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    testb %al, %al
 ; CHECK-NEXT:    jne .LBB5_3
 ; CHECK-NEXT:  # %bb.2: # %if.true
 ; CHECK-NEXT:    # in Loop: Header=BB5_1 Depth=1
-; CHECK-NEXT:    incl %eax
+; CHECK-NEXT:    incl %esi
 ; CHECK-NEXT:    jmp .LBB5_4
 ; CHECK-NEXT:  .LBB5_5: # %exit
 ; CHECK-NEXT:    tilerelease

diff  --git a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
index fda0244cea6eb..4fb0a4445862f 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
@@ -242,25 +242,25 @@ define dso_local void @test3(ptr%buf) nounwind {
 ; CHECK-NEXT:    jne .LBB1_3
 ; CHECK-NEXT:  # %bb.1: # %loop.header.preheader
 ; CHECK-NEXT:    movq %rdi, %rbx
-; CHECK-NEXT:    xorl %r14d, %r14d
-; CHECK-NEXT:    movl $32, %r15d
+; CHECK-NEXT:    movl $32, %r14d
+; CHECK-NEXT:    xorl %r15d, %r15d
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB1_2: # %loop.header
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    tilestored %tmm0, (%rbx,%r15)
+; CHECK-NEXT:    tilestored %tmm0, (%rbx,%r14)
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    tilezero %tmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq foo
 ; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    tilezero %tmm0
-; CHECK-NEXT:    tileloadd (%rbx,%r15), %tmm1
-; CHECK-NEXT:    tileloadd (%rbx,%r15), %tmm2
+; CHECK-NEXT:    tileloadd (%rbx,%r14), %tmm1
+; CHECK-NEXT:    tileloadd (%rbx,%r14), %tmm2
 ; CHECK-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
-; CHECK-NEXT:    tilestored %tmm0, (%rbx,%r15)
+; CHECK-NEXT:    tilestored %tmm0, (%rbx,%r14)
 ; CHECK-NEXT:    tilezero %tmm0
-; CHECK-NEXT:    incl %r14d
-; CHECK-NEXT:    cmpw $100, %r14w
+; CHECK-NEXT:    incl %r15d
+; CHECK-NEXT:    cmpw $100, %r15w
 ; CHECK-NEXT:    jl .LBB1_2
 ; CHECK-NEXT:  .LBB1_3: # %exit
 ; CHECK-NEXT:    addq $72, %rsp
@@ -297,12 +297,12 @@ define dso_local void @test3(ptr%buf) nounwind {
 ; EGPR-NEXT:    # fixup A - offset: 1, value: .LBB1_3-1, kind: FK_PCRel_1
 ; EGPR-NEXT:  # %bb.1: # %loop.header.preheader
 ; EGPR-NEXT:    movq %rdi, %rbx # encoding: [0x48,0x89,0xfb]
-; EGPR-NEXT:    xorl %r14d, %r14d # encoding: [0x45,0x31,0xf6]
-; EGPR-NEXT:    movl $32, %r15d # encoding: [0x41,0xbf,0x20,0x00,0x00,0x00]
+; EGPR-NEXT:    movl $32, %r14d # encoding: [0x41,0xbe,0x20,0x00,0x00,0x00]
+; EGPR-NEXT:    xorl %r15d, %r15d # encoding: [0x45,0x31,0xff]
 ; EGPR-NEXT:    .p2align 4
 ; EGPR-NEXT:  .LBB1_2: # %loop.header
 ; EGPR-NEXT:    # =>This Inner Loop Header: Depth=1
-; EGPR-NEXT:    tilestored %tmm0, (%rbx,%r15) # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7a,0x4b,0x04,0x3b]
+; EGPR-NEXT:    tilestored %tmm0, (%rbx,%r14) # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7a,0x4b,0x04,0x33]
 ; EGPR-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
 ; EGPR-NEXT:    tilezero %tmm0 # encoding: [0xc4,0xe2,0x7b,0x49,0xc0]
 ; EGPR-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
@@ -310,13 +310,13 @@ define dso_local void @test3(ptr%buf) nounwind {
 ; EGPR-NEXT:    # fixup A - offset: 1, value: foo-4, kind: reloc_branch_4byte_pcrel
 ; EGPR-NEXT:    ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0x08]
 ; EGPR-NEXT:    tilezero %tmm0 # encoding: [0xc4,0xe2,0x7b,0x49,0xc0]
-; EGPR-NEXT:    tileloadd (%rbx,%r15), %tmm1 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x0c,0x3b]
-; EGPR-NEXT:    tileloadd (%rbx,%r15), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x14,0x3b]
+; EGPR-NEXT:    tileloadd (%rbx,%r14), %tmm1 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x0c,0x33]
+; EGPR-NEXT:    tileloadd (%rbx,%r14), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x14,0x33]
 ; EGPR-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0 # encoding: [0xc4,0xe2,0x6b,0x5e,0xc1]
-; EGPR-NEXT:    tilestored %tmm0, (%rbx,%r15) # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7a,0x4b,0x04,0x3b]
+; EGPR-NEXT:    tilestored %tmm0, (%rbx,%r14) # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7a,0x4b,0x04,0x33]
 ; EGPR-NEXT:    tilezero %tmm0 # encoding: [0xc4,0xe2,0x7b,0x49,0xc0]
-; EGPR-NEXT:    incl %r14d # encoding: [0x41,0xff,0xc6]
-; EGPR-NEXT:    cmpw $100, %r14w # encoding: [0x66,0x41,0x83,0xfe,0x64]
+; EGPR-NEXT:    incl %r15d # encoding: [0x41,0xff,0xc7]
+; EGPR-NEXT:    cmpw $100, %r15w # encoding: [0x66,0x41,0x83,0xff,0x64]
 ; EGPR-NEXT:    jl .LBB1_2 # encoding: [0x7c,A]
 ; EGPR-NEXT:    # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1
 ; EGPR-NEXT:  .LBB1_3: # %exit

diff  --git a/llvm/test/CodeGen/X86/atomic32.ll b/llvm/test/CodeGen/X86/atomic32.ll
index 3263b9f61a281..f4666738db7d2 100644
--- a/llvm/test/CodeGen/X86/atomic32.ll
+++ b/llvm/test/CodeGen/X86/atomic32.ll
@@ -228,10 +228,9 @@ define void @atomic_fetch_nand32(i32 %x) nounwind {
 ; X64-NEXT:    andl %edx, %ecx
 ; X64-NEXT:    notl %ecx
 ; X64-NEXT:    lock cmpxchgl %ecx, sc32(%rip)
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    sete %al
-; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; X64-NEXT:    testb $1, %al
+; X64-NEXT:    sete %cl
+; X64-NEXT:    testb $1, %cl
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB5_2
 ; X64-NEXT:    jmp .LBB5_1
 ; X64-NEXT:  .LBB5_2: # %atomicrmw.end
@@ -252,10 +251,9 @@ define void @atomic_fetch_nand32(i32 %x) nounwind {
 ; X86-NEXT:    andl %edx, %ecx
 ; X86-NEXT:    notl %ecx
 ; X86-NEXT:    lock cmpxchgl %ecx, sc32
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    sete %al
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    testb $1, %al
+; X86-NEXT:    sete %cl
+; X86-NEXT:    testb $1, %cl
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    jne .LBB5_2
 ; X86-NEXT:    jmp .LBB5_1
 ; X86-NEXT:  .LBB5_2: # %atomicrmw.end
@@ -279,10 +277,9 @@ define void @atomic_fetch_max32(i32 %x) nounwind {
 ; X64-NEXT:    subl %ecx, %edx
 ; X64-NEXT:    cmovgl %eax, %ecx
 ; X64-NEXT:    lock cmpxchgl %ecx, sc32(%rip)
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    sete %al
-; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; X64-NEXT:    testb $1, %al
+; X64-NEXT:    sete %cl
+; X64-NEXT:    testb $1, %cl
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB6_2
 ; X64-NEXT:    jmp .LBB6_1
 ; X64-NEXT:  .LBB6_2: # %atomicrmw.end
@@ -303,10 +300,9 @@ define void @atomic_fetch_max32(i32 %x) nounwind {
 ; X86-CMOV-NEXT:    subl %ecx, %edx
 ; X86-CMOV-NEXT:    cmovgl %eax, %ecx
 ; X86-CMOV-NEXT:    lock cmpxchgl %ecx, sc32
-; X86-CMOV-NEXT:    movl %eax, %ecx
-; X86-CMOV-NEXT:    sete %al
-; X86-CMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-CMOV-NEXT:    testb $1, %al
+; X86-CMOV-NEXT:    sete %cl
+; X86-CMOV-NEXT:    testb $1, %cl
+; X86-CMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-CMOV-NEXT:    jne .LBB6_2
 ; X86-CMOV-NEXT:    jmp .LBB6_1
 ; X86-CMOV-NEXT:  .LBB6_2: # %atomicrmw.end
@@ -338,10 +334,9 @@ define void @atomic_fetch_max32(i32 %x) nounwind {
 ; X86-NOCMOV-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    lock cmpxchgl %ecx, sc32
-; X86-NOCMOV-NEXT:    movl %eax, %ecx
-; X86-NOCMOV-NEXT:    sete %al
-; X86-NOCMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOCMOV-NEXT:    testb $1, %al
+; X86-NOCMOV-NEXT:    sete %cl
+; X86-NOCMOV-NEXT:    testb $1, %cl
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    jne .LBB6_2
 ; X86-NOCMOV-NEXT:    jmp .LBB6_1
 ; X86-NOCMOV-NEXT:  .LBB6_2: # %atomicrmw.end
@@ -373,10 +368,9 @@ define void @atomic_fetch_max32(i32 %x) nounwind {
 ; X86-NOX87-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NOX87-NEXT:    lock cmpxchgl %ecx, sc32
-; X86-NOX87-NEXT:    movl %eax, %ecx
-; X86-NOX87-NEXT:    sete %al
-; X86-NOX87-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    testb $1, %al
+; X86-NOX87-NEXT:    sete %cl
+; X86-NOX87-NEXT:    testb $1, %cl
+; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    jne .LBB6_2
 ; X86-NOX87-NEXT:    jmp .LBB6_1
 ; X86-NOX87-NEXT:  .LBB6_2: # %atomicrmw.end
@@ -400,10 +394,9 @@ define void @atomic_fetch_min32(i32 %x) nounwind {
 ; X64-NEXT:    subl %ecx, %edx
 ; X64-NEXT:    cmovlel %eax, %ecx
 ; X64-NEXT:    lock cmpxchgl %ecx, sc32(%rip)
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    sete %al
-; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; X64-NEXT:    testb $1, %al
+; X64-NEXT:    sete %cl
+; X64-NEXT:    testb $1, %cl
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB7_2
 ; X64-NEXT:    jmp .LBB7_1
 ; X64-NEXT:  .LBB7_2: # %atomicrmw.end
@@ -424,10 +417,9 @@ define void @atomic_fetch_min32(i32 %x) nounwind {
 ; X86-CMOV-NEXT:    subl %ecx, %edx
 ; X86-CMOV-NEXT:    cmovlel %eax, %ecx
 ; X86-CMOV-NEXT:    lock cmpxchgl %ecx, sc32
-; X86-CMOV-NEXT:    movl %eax, %ecx
-; X86-CMOV-NEXT:    sete %al
-; X86-CMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-CMOV-NEXT:    testb $1, %al
+; X86-CMOV-NEXT:    sete %cl
+; X86-CMOV-NEXT:    testb $1, %cl
+; X86-CMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-CMOV-NEXT:    jne .LBB7_2
 ; X86-CMOV-NEXT:    jmp .LBB7_1
 ; X86-CMOV-NEXT:  .LBB7_2: # %atomicrmw.end
@@ -459,10 +451,9 @@ define void @atomic_fetch_min32(i32 %x) nounwind {
 ; X86-NOCMOV-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    lock cmpxchgl %ecx, sc32
-; X86-NOCMOV-NEXT:    movl %eax, %ecx
-; X86-NOCMOV-NEXT:    sete %al
-; X86-NOCMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOCMOV-NEXT:    testb $1, %al
+; X86-NOCMOV-NEXT:    sete %cl
+; X86-NOCMOV-NEXT:    testb $1, %cl
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    jne .LBB7_2
 ; X86-NOCMOV-NEXT:    jmp .LBB7_1
 ; X86-NOCMOV-NEXT:  .LBB7_2: # %atomicrmw.end
@@ -494,10 +485,9 @@ define void @atomic_fetch_min32(i32 %x) nounwind {
 ; X86-NOX87-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NOX87-NEXT:    lock cmpxchgl %ecx, sc32
-; X86-NOX87-NEXT:    movl %eax, %ecx
-; X86-NOX87-NEXT:    sete %al
-; X86-NOX87-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    testb $1, %al
+; X86-NOX87-NEXT:    sete %cl
+; X86-NOX87-NEXT:    testb $1, %cl
+; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    jne .LBB7_2
 ; X86-NOX87-NEXT:    jmp .LBB7_1
 ; X86-NOX87-NEXT:  .LBB7_2: # %atomicrmw.end
@@ -521,10 +511,9 @@ define void @atomic_fetch_umax32(i32 %x) nounwind {
 ; X64-NEXT:    subl %ecx, %edx
 ; X64-NEXT:    cmoval %eax, %ecx
 ; X64-NEXT:    lock cmpxchgl %ecx, sc32(%rip)
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    sete %al
-; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; X64-NEXT:    testb $1, %al
+; X64-NEXT:    sete %cl
+; X64-NEXT:    testb $1, %cl
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB8_2
 ; X64-NEXT:    jmp .LBB8_1
 ; X64-NEXT:  .LBB8_2: # %atomicrmw.end
@@ -545,10 +534,9 @@ define void @atomic_fetch_umax32(i32 %x) nounwind {
 ; X86-CMOV-NEXT:    subl %ecx, %edx
 ; X86-CMOV-NEXT:    cmoval %eax, %ecx
 ; X86-CMOV-NEXT:    lock cmpxchgl %ecx, sc32
-; X86-CMOV-NEXT:    movl %eax, %ecx
-; X86-CMOV-NEXT:    sete %al
-; X86-CMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-CMOV-NEXT:    testb $1, %al
+; X86-CMOV-NEXT:    sete %cl
+; X86-CMOV-NEXT:    testb $1, %cl
+; X86-CMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-CMOV-NEXT:    jne .LBB8_2
 ; X86-CMOV-NEXT:    jmp .LBB8_1
 ; X86-CMOV-NEXT:  .LBB8_2: # %atomicrmw.end
@@ -580,10 +568,9 @@ define void @atomic_fetch_umax32(i32 %x) nounwind {
 ; X86-NOCMOV-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    lock cmpxchgl %ecx, sc32
-; X86-NOCMOV-NEXT:    movl %eax, %ecx
-; X86-NOCMOV-NEXT:    sete %al
-; X86-NOCMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOCMOV-NEXT:    testb $1, %al
+; X86-NOCMOV-NEXT:    sete %cl
+; X86-NOCMOV-NEXT:    testb $1, %cl
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    jne .LBB8_2
 ; X86-NOCMOV-NEXT:    jmp .LBB8_1
 ; X86-NOCMOV-NEXT:  .LBB8_2: # %atomicrmw.end
@@ -615,10 +602,9 @@ define void @atomic_fetch_umax32(i32 %x) nounwind {
 ; X86-NOX87-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NOX87-NEXT:    lock cmpxchgl %ecx, sc32
-; X86-NOX87-NEXT:    movl %eax, %ecx
-; X86-NOX87-NEXT:    sete %al
-; X86-NOX87-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    testb $1, %al
+; X86-NOX87-NEXT:    sete %cl
+; X86-NOX87-NEXT:    testb $1, %cl
+; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    jne .LBB8_2
 ; X86-NOX87-NEXT:    jmp .LBB8_1
 ; X86-NOX87-NEXT:  .LBB8_2: # %atomicrmw.end
@@ -642,10 +628,9 @@ define void @atomic_fetch_umin32(i32 %x) nounwind {
 ; X64-NEXT:    subl %ecx, %edx
 ; X64-NEXT:    cmovbel %eax, %ecx
 ; X64-NEXT:    lock cmpxchgl %ecx, sc32(%rip)
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    sete %al
-; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; X64-NEXT:    testb $1, %al
+; X64-NEXT:    sete %cl
+; X64-NEXT:    testb $1, %cl
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB9_2
 ; X64-NEXT:    jmp .LBB9_1
 ; X64-NEXT:  .LBB9_2: # %atomicrmw.end
@@ -666,10 +651,9 @@ define void @atomic_fetch_umin32(i32 %x) nounwind {
 ; X86-CMOV-NEXT:    subl %ecx, %edx
 ; X86-CMOV-NEXT:    cmovbel %eax, %ecx
 ; X86-CMOV-NEXT:    lock cmpxchgl %ecx, sc32
-; X86-CMOV-NEXT:    movl %eax, %ecx
-; X86-CMOV-NEXT:    sete %al
-; X86-CMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-CMOV-NEXT:    testb $1, %al
+; X86-CMOV-NEXT:    sete %cl
+; X86-CMOV-NEXT:    testb $1, %cl
+; X86-CMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-CMOV-NEXT:    jne .LBB9_2
 ; X86-CMOV-NEXT:    jmp .LBB9_1
 ; X86-CMOV-NEXT:  .LBB9_2: # %atomicrmw.end
@@ -701,10 +685,9 @@ define void @atomic_fetch_umin32(i32 %x) nounwind {
 ; X86-NOCMOV-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    lock cmpxchgl %ecx, sc32
-; X86-NOCMOV-NEXT:    movl %eax, %ecx
-; X86-NOCMOV-NEXT:    sete %al
-; X86-NOCMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOCMOV-NEXT:    testb $1, %al
+; X86-NOCMOV-NEXT:    sete %cl
+; X86-NOCMOV-NEXT:    testb $1, %cl
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    jne .LBB9_2
 ; X86-NOCMOV-NEXT:    jmp .LBB9_1
 ; X86-NOCMOV-NEXT:  .LBB9_2: # %atomicrmw.end
@@ -736,10 +719,9 @@ define void @atomic_fetch_umin32(i32 %x) nounwind {
 ; X86-NOX87-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NOX87-NEXT:    lock cmpxchgl %ecx, sc32
-; X86-NOX87-NEXT:    movl %eax, %ecx
-; X86-NOX87-NEXT:    sete %al
-; X86-NOX87-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    testb $1, %al
+; X86-NOX87-NEXT:    sete %cl
+; X86-NOX87-NEXT:    testb $1, %cl
+; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    jne .LBB9_2
 ; X86-NOX87-NEXT:    jmp .LBB9_1
 ; X86-NOX87-NEXT:  .LBB9_2: # %atomicrmw.end

diff  --git a/llvm/test/CodeGen/X86/atomic64.ll b/llvm/test/CodeGen/X86/atomic64.ll
index 6fc72bcf67ec5..8f4da356e06cb 100644
--- a/llvm/test/CodeGen/X86/atomic64.ll
+++ b/llvm/test/CodeGen/X86/atomic64.ll
@@ -275,10 +275,9 @@ define void @atomic_fetch_nand64(i64 %x) nounwind {
 ; X64-NEXT:    andq %rdx, %rcx
 ; X64-NEXT:    notq %rcx
 ; X64-NEXT:    lock cmpxchgq %rcx, sc64(%rip)
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    testb $1, %al
+; X64-NEXT:    sete %cl
+; X64-NEXT:    testb $1, %cl
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    jne .LBB5_2
 ; X64-NEXT:    jmp .LBB5_1
 ; X64-NEXT:  .LBB5_2: # %atomicrmw.end
@@ -315,10 +314,9 @@ define void @atomic_fetch_max64(i64 %x) nounwind {
 ; X64-NEXT:    subq %rcx, %rdx
 ; X64-NEXT:    cmovgq %rax, %rcx
 ; X64-NEXT:    lock cmpxchgq %rcx, sc64(%rip)
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    testb $1, %al
+; X64-NEXT:    sete %cl
+; X64-NEXT:    testb $1, %cl
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    jne .LBB6_2
 ; X64-NEXT:    jmp .LBB6_1
 ; X64-NEXT:  .LBB6_2: # %atomicrmw.end
@@ -408,10 +406,9 @@ define void @atomic_fetch_min64(i64 %x) nounwind {
 ; X64-NEXT:    subq %rcx, %rdx
 ; X64-NEXT:    cmovleq %rax, %rcx
 ; X64-NEXT:    lock cmpxchgq %rcx, sc64(%rip)
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    testb $1, %al
+; X64-NEXT:    sete %cl
+; X64-NEXT:    testb $1, %cl
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    jne .LBB7_2
 ; X64-NEXT:    jmp .LBB7_1
 ; X64-NEXT:  .LBB7_2: # %atomicrmw.end
@@ -501,10 +498,9 @@ define void @atomic_fetch_umax64(i64 %x) nounwind {
 ; X64-NEXT:    subq %rcx, %rdx
 ; X64-NEXT:    cmovaq %rax, %rcx
 ; X64-NEXT:    lock cmpxchgq %rcx, sc64(%rip)
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    testb $1, %al
+; X64-NEXT:    sete %cl
+; X64-NEXT:    testb $1, %cl
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    jne .LBB8_2
 ; X64-NEXT:    jmp .LBB8_1
 ; X64-NEXT:  .LBB8_2: # %atomicrmw.end
@@ -594,10 +590,9 @@ define void @atomic_fetch_umin64(i64 %x) nounwind {
 ; X64-NEXT:    subq %rcx, %rdx
 ; X64-NEXT:    cmovbeq %rax, %rcx
 ; X64-NEXT:    lock cmpxchgq %rcx, sc64(%rip)
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    testb $1, %al
+; X64-NEXT:    sete %cl
+; X64-NEXT:    testb $1, %cl
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    jne .LBB9_2
 ; X64-NEXT:    jmp .LBB9_1
 ; X64-NEXT:  .LBB9_2: # %atomicrmw.end

diff  --git a/llvm/test/CodeGen/X86/atomic6432.ll b/llvm/test/CodeGen/X86/atomic6432.ll
index 3d0617370a1bb..8ff5f338e1482 100644
--- a/llvm/test/CodeGen/X86/atomic6432.ll
+++ b/llvm/test/CodeGen/X86/atomic6432.ll
@@ -16,15 +16,15 @@ define void @atomic_fetch_add64() nounwind {
 ; X32-NEXT:    jmp .LBB0_1
 ; X32-NEXT:  .LBB0_1: # %atomicrmw.start14
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl $1, %ebx
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    lock cmpxchg8b sc64
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    jne .LBB0_1
 ; X32-NEXT:    jmp .LBB0_2
 ; X32-NEXT:  .LBB0_2: # %atomicrmw.end13
@@ -35,15 +35,15 @@ define void @atomic_fetch_add64() nounwind {
 ; X32-NEXT:    jmp .LBB0_3
 ; X32-NEXT:  .LBB0_3: # %atomicrmw.start8
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl $3, %ebx
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    lock cmpxchg8b sc64
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    jne .LBB0_3
 ; X32-NEXT:    jmp .LBB0_4
 ; X32-NEXT:  .LBB0_4: # %atomicrmw.end7
@@ -75,8 +75,8 @@ define void @atomic_fetch_add64() nounwind {
 ; X32-NEXT:    jmp .LBB0_7
 ; X32-NEXT:  .LBB0_7: # %atomicrmw.start
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %eax, %ebx
@@ -84,8 +84,8 @@ define void @atomic_fetch_add64() nounwind {
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    adcl %esi, %ecx
 ; X32-NEXT:    lock cmpxchg8b sc64
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    jne .LBB0_7
 ; X32-NEXT:    jmp .LBB0_8
 ; X32-NEXT:  .LBB0_8: # %atomicrmw.end
@@ -114,15 +114,15 @@ define void @atomic_fetch_sub64() nounwind {
 ; X32-NEXT:    jmp .LBB1_1
 ; X32-NEXT:  .LBB1_1: # %atomicrmw.start14
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl $-1, %ebx
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    adcl $-1, %ecx
 ; X32-NEXT:    lock cmpxchg8b sc64
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    jne .LBB1_1
 ; X32-NEXT:    jmp .LBB1_2
 ; X32-NEXT:  .LBB1_2: # %atomicrmw.end13
@@ -133,15 +133,15 @@ define void @atomic_fetch_sub64() nounwind {
 ; X32-NEXT:    jmp .LBB1_3
 ; X32-NEXT:  .LBB1_3: # %atomicrmw.start8
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl $-3, %ebx
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    adcl $-1, %ecx
 ; X32-NEXT:    lock cmpxchg8b sc64
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    jne .LBB1_3
 ; X32-NEXT:    jmp .LBB1_4
 ; X32-NEXT:  .LBB1_4: # %atomicrmw.end7
@@ -173,8 +173,8 @@ define void @atomic_fetch_sub64() nounwind {
 ; X32-NEXT:    jmp .LBB1_7
 ; X32-NEXT:  .LBB1_7: # %atomicrmw.start
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %eax, %ebx
@@ -182,8 +182,8 @@ define void @atomic_fetch_sub64() nounwind {
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    sbbl %esi, %ecx
 ; X32-NEXT:    lock cmpxchg8b sc64
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    jne .LBB1_7
 ; X32-NEXT:    jmp .LBB1_8
 ; X32-NEXT:  .LBB1_8: # %atomicrmw.end
@@ -211,14 +211,14 @@ define void @atomic_fetch_and64() nounwind {
 ; X32-NEXT:    jmp .LBB2_1
 ; X32-NEXT:  .LBB2_1: # %atomicrmw.start8
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    andl $3, %ebx
 ; X32-NEXT:    xorl %ecx, %ecx
 ; X32-NEXT:    lock cmpxchg8b sc64
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    jne .LBB2_1
 ; X32-NEXT:    jmp .LBB2_2
 ; X32-NEXT:  .LBB2_2: # %atomicrmw.end7
@@ -250,8 +250,8 @@ define void @atomic_fetch_and64() nounwind {
 ; X32-NEXT:    jmp .LBB2_5
 ; X32-NEXT:  .LBB2_5: # %atomicrmw.start
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %eax, %ebx
@@ -259,8 +259,8 @@ define void @atomic_fetch_and64() nounwind {
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    andl %esi, %ecx
 ; X32-NEXT:    lock cmpxchg8b sc64
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    jne .LBB2_5
 ; X32-NEXT:    jmp .LBB2_6
 ; X32-NEXT:  .LBB2_6: # %atomicrmw.end
@@ -287,14 +287,14 @@ define void @atomic_fetch_or64() nounwind {
 ; X32-NEXT:    jmp .LBB3_1
 ; X32-NEXT:  .LBB3_1: # %atomicrmw.start8
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    orl $3, %ebx
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    lock cmpxchg8b sc64
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    jne .LBB3_1
 ; X32-NEXT:    jmp .LBB3_2
 ; X32-NEXT:  .LBB3_2: # %atomicrmw.end7
@@ -326,8 +326,8 @@ define void @atomic_fetch_or64() nounwind {
 ; X32-NEXT:    jmp .LBB3_5
 ; X32-NEXT:  .LBB3_5: # %atomicrmw.start
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %eax, %ebx
@@ -335,8 +335,8 @@ define void @atomic_fetch_or64() nounwind {
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    orl %esi, %ecx
 ; X32-NEXT:    lock cmpxchg8b sc64
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    jne .LBB3_5
 ; X32-NEXT:    jmp .LBB3_6
 ; X32-NEXT:  .LBB3_6: # %atomicrmw.end
@@ -363,14 +363,14 @@ define void @atomic_fetch_xor64() nounwind {
 ; X32-NEXT:    jmp .LBB4_1
 ; X32-NEXT:  .LBB4_1: # %atomicrmw.start8
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    xorl $3, %ebx
 ; X32-NEXT:    movl %ecx, %edx
 ; X32-NEXT:    lock cmpxchg8b sc64
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    jne .LBB4_1
 ; X32-NEXT:    jmp .LBB4_2
 ; X32-NEXT:  .LBB4_2: # %atomicrmw.end7
@@ -402,8 +402,8 @@ define void @atomic_fetch_xor64() nounwind {
 ; X32-NEXT:    jmp .LBB4_5
 ; X32-NEXT:  .LBB4_5: # %atomicrmw.start
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %eax, %ebx
@@ -411,8 +411,8 @@ define void @atomic_fetch_xor64() nounwind {
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    xorl %esi, %ecx
 ; X32-NEXT:    lock cmpxchg8b sc64
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    jne .LBB4_5
 ; X32-NEXT:    jmp .LBB4_6
 ; X32-NEXT:  .LBB4_6: # %atomicrmw.end
@@ -444,8 +444,8 @@ define void @atomic_fetch_nand64(i64 %x) nounwind {
 ; X32-NEXT:    jmp .LBB5_1
 ; X32-NEXT:  .LBB5_1: # %atomicrmw.start
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edx, %ecx
@@ -455,8 +455,8 @@ define void @atomic_fetch_nand64(i64 %x) nounwind {
 ; X32-NEXT:    notl %ebx
 ; X32-NEXT:    notl %ecx
 ; X32-NEXT:    lock cmpxchg8b sc64
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    jne .LBB5_1
 ; X32-NEXT:    jmp .LBB5_2
 ; X32-NEXT:  .LBB5_2: # %atomicrmw.end
@@ -486,8 +486,8 @@ define void @atomic_fetch_max64(i64 %x) nounwind {
 ; X32-NEXT:    jmp .LBB6_1
 ; X32-NEXT:  .LBB6_1: # %atomicrmw.start
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %esi
@@ -497,8 +497,8 @@ define void @atomic_fetch_max64(i64 %x) nounwind {
 ; X32-NEXT:    cmovll %edx, %ecx
 ; X32-NEXT:    cmovll %eax, %ebx
 ; X32-NEXT:    lock cmpxchg8b sc64
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    jne .LBB6_1
 ; X32-NEXT:    jmp .LBB6_2
 ; X32-NEXT:  .LBB6_2: # %atomicrmw.end
@@ -527,8 +527,8 @@ define void @atomic_fetch_min64(i64 %x) nounwind {
 ; X32-NEXT:    jmp .LBB7_1
 ; X32-NEXT:  .LBB7_1: # %atomicrmw.start
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %esi
@@ -538,8 +538,8 @@ define void @atomic_fetch_min64(i64 %x) nounwind {
 ; X32-NEXT:    cmovgel %edx, %ecx
 ; X32-NEXT:    cmovgel %eax, %ebx
 ; X32-NEXT:    lock cmpxchg8b sc64
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    jne .LBB7_1
 ; X32-NEXT:    jmp .LBB7_2
 ; X32-NEXT:  .LBB7_2: # %atomicrmw.end
@@ -568,8 +568,8 @@ define void @atomic_fetch_umax64(i64 %x) nounwind {
 ; X32-NEXT:    jmp .LBB8_1
 ; X32-NEXT:  .LBB8_1: # %atomicrmw.start
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %esi
@@ -579,8 +579,8 @@ define void @atomic_fetch_umax64(i64 %x) nounwind {
 ; X32-NEXT:    cmovbl %edx, %ecx
 ; X32-NEXT:    cmovbl %eax, %ebx
 ; X32-NEXT:    lock cmpxchg8b sc64
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    jne .LBB8_1
 ; X32-NEXT:    jmp .LBB8_2
 ; X32-NEXT:  .LBB8_2: # %atomicrmw.end
@@ -609,8 +609,8 @@ define void @atomic_fetch_umin64(i64 %x) nounwind {
 ; X32-NEXT:    jmp .LBB9_1
 ; X32-NEXT:  .LBB9_1: # %atomicrmw.start
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %esi
@@ -620,8 +620,8 @@ define void @atomic_fetch_umin64(i64 %x) nounwind {
 ; X32-NEXT:    cmovael %edx, %ecx
 ; X32-NEXT:    cmovael %eax, %ebx
 ; X32-NEXT:    lock cmpxchg8b sc64
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    jne .LBB9_1
 ; X32-NEXT:    jmp .LBB9_2
 ; X32-NEXT:  .LBB9_2: # %atomicrmw.end
@@ -677,13 +677,13 @@ define void @atomic_fetch_swap64(i64 %x) nounwind {
 ; X32-NEXT:    jmp .LBB12_1
 ; X32-NEXT:  .LBB12_1: # %atomicrmw.start
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X32-NEXT:    lock cmpxchg8b sc64
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    jne .LBB12_1
 ; X32-NEXT:    jmp .LBB12_2
 ; X32-NEXT:  .LBB12_2: # %atomicrmw.end

diff  --git a/llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll b/llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll
index 95faca819e975..3d389523dffb3 100644
--- a/llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll
+++ b/llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll
@@ -17,7 +17,7 @@ define dso_local void @n(ptr %o, i32 %p, i32 %u) nounwind {
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    movl %edx, %ebp
-; CHECK-NEXT:    movl %esi, %r13d
+; CHECK-NEXT:    movl %esi, %r12d
 ; CHECK-NEXT:    movq %rdi, %rbx
 ; CHECK-NEXT:    callq c
 ; CHECK-NEXT:    movl %eax, %r14d
@@ -28,17 +28,17 @@ define dso_local void @n(ptr %o, i32 %p, i32 %u) nounwind {
 ; CHECK-NEXT:  # %bb.1: # %if.end
 ; CHECK-NEXT:    cmpl $0, e(%rip)
 ; CHECK-NEXT:    # implicit-def: $r15d
-; CHECK-NEXT:    # implicit-def: $r12d
+; CHECK-NEXT:    # implicit-def: $r13d
 ; CHECK-NEXT:    je .LBB0_4
 ; CHECK-NEXT:  # %bb.2: # %if.then4
-; CHECK-NEXT:    movslq %r13d, %rdi
+; CHECK-NEXT:    movslq %r12d, %rdi
 ; CHECK-NEXT:    callq m
 ; CHECK-NEXT:    # implicit-def: $r15d
 ; CHECK-NEXT:    # implicit-def: $r12d
 ; CHECK-NEXT:  .LBB0_3: # %r
 ; CHECK-NEXT:    callq c
-; CHECK-NEXT:  .LBB0_4: # %if.end8
 ; CHECK-NEXT:    movl %r12d, %r13d
+; CHECK-NEXT:  .LBB0_4: # %if.end8
 ; CHECK-NEXT:    movl %r15d, %edi
 ; CHECK-NEXT:    callq i
 ; CHECK-NEXT:    movl %eax, %r12d

diff  --git a/llvm/test/CodeGen/X86/callbr-asm-kill.mir b/llvm/test/CodeGen/X86/callbr-asm-kill.mir
index 58a9e4de77faf..5aabeade52da1 100644
--- a/llvm/test/CodeGen/X86/callbr-asm-kill.mir
+++ b/llvm/test/CodeGen/X86/callbr-asm-kill.mir
@@ -6,12 +6,14 @@
 # subsequent use of [[MOV64rm]] in the INLINEASM_BR instruction which should be
 # killed instead.
 --- |
+  ; ModuleID = '<stdin>'
   source_filename = "<stdin>"
   target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
   target triple = "x86_64-unknown-linux-gnu"
 
   declare void @foo(ptr)
 
+  ; Function Attrs: nounwind
   define void @test1(ptr %arg, ptr %mem) #0 {
   entry:
     br label %loop
@@ -55,17 +57,18 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gr64 = COPY killed $rsi
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr64 = COPY killed $rdi
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gr64 = COPY killed [[COPY1]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1.loop (ir-block-address-taken %ir-block.loop, inlineasm-br-indirect-target):
   ; CHECK-NEXT:   successors: %bb.2(0x80000000), %bb.1(0x00000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gr64 = COPY killed [[COPY1]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gr64 = COPY killed [[COPY2]]
   ; CHECK-NEXT:   [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm [[COPY]], 1, $noreg, 0, $noreg :: (load (s64) from %ir.mem)
   ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
-  ; CHECK-NEXT:   $rdi = COPY killed [[COPY2]]
+  ; CHECK-NEXT:   $rdi = COPY killed [[COPY3]]
   ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @foo, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp
   ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr64 = COPY [[MOV64rm]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gr64 = COPY [[MOV64rm]]
   ; CHECK-NEXT:   INLINEASM_BR &"", 9 /* sideeffect mayload attdialect */, 262190 /* mem:m */, killed [[MOV64rm]], 1, $noreg, 0, $noreg, 13 /* imm */, blockaddress(@test1, %ir-block.loop)
   ; CHECK-NEXT:   JMP_1 %bb.2
   ; CHECK-NEXT: {{  $}}

diff  --git a/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll b/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll
index fe8d131a977ea..e9f529eea7d3f 100644
--- a/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll
+++ b/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll
@@ -49,8 +49,8 @@ define void @foo(ptr %arg3, i1 %icmp16) #0 {
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB0_1: # %bb5
 ; CHECK-NEXT:    # in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    movq %r14, %r15
 ; CHECK-NEXT:    orl $1, %r12d
+; CHECK-NEXT:    movq %r14, %r15
 ; CHECK-NEXT:  .LBB0_2: # %bb7
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    xorl %eax, %eax

diff  --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll
index 845c6cd6f2454..70335f834291d 100644
--- a/llvm/test/CodeGen/X86/combine-pmuldq.ll
+++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll
@@ -328,9 +328,9 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) {
 ; SSE:       # %bb.0: # %start
 ; SSE-NEXT:    movd %esi, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    movq $-2097152, %rax # imm = 0xFFE00000
 ; SSE-NEXT:    pxor %xmm0, %xmm0
+; SSE-NEXT:    movq $-2097152, %rax # imm = 0xFFE00000
+; SSE-NEXT:    pxor %xmm1, %xmm1
 ; SSE-NEXT:    .p2align 4
 ; SSE-NEXT:  .LBB7_1: # %loop
 ; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -469,11 +469,11 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) {
 ; SSE-NEXT:    movslq %esi, %rax
 ; SSE-NEXT:    movq %rax, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    pxor %xmm0, %xmm0
 ; SSE-NEXT:    movq $-2097152, %rax # imm = 0xFFE00000
 ; SSE-NEXT:    movdqa %xmm2, %xmm3
 ; SSE-NEXT:    psrlq $32, %xmm3
-; SSE-NEXT:    pxor %xmm0, %xmm0
+; SSE-NEXT:    pxor %xmm1, %xmm1
 ; SSE-NEXT:    .p2align 4
 ; SSE-NEXT:  .LBB8_1: # %loop
 ; SSE-NEXT:    # =>This Inner Loop Header: Depth=1

diff  --git a/llvm/test/CodeGen/X86/fp128-select.ll b/llvm/test/CodeGen/X86/fp128-select.ll
index ca1e676e91f35..0486c1c4d28e9 100644
--- a/llvm/test/CodeGen/X86/fp128-select.ll
+++ b/llvm/test/CodeGen/X86/fp128-select.ll
@@ -14,7 +14,7 @@ define void @test_select(ptr %p, ptr %q, i1 zeroext %c) {
 ; SSE-NEXT:    testl %edx, %edx
 ; SSE-NEXT:    jne .LBB0_1
 ; SSE-NEXT:  # %bb.3:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [NaN]
+; SSE-NEXT:    movaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT:    movaps %xmm0, (%rsi)
 ; SSE-NEXT:    retq
 ; SSE-NEXT:  .LBB0_1:
@@ -61,7 +61,7 @@ define fp128 @test_select_cc(fp128, fp128) {
 ; SSE-NEXT:    xorps %xmm1, %xmm1
 ; SSE-NEXT:    jmp .LBB1_3
 ; SSE-NEXT:  .LBB1_1:
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0]
+; SSE-NEXT:    movaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE-NEXT:  .LBB1_3: # %BB0
 ; SSE-NEXT:    testl %ebx, %ebx
 ; SSE-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
@@ -103,18 +103,17 @@ define fp128 @test_select_cc(fp128, fp128) {
 ; NOSSE-NEXT:    movq %r12, %rdx
 ; NOSSE-NEXT:    movq %r15, %rcx
 ; NOSSE-NEXT:    callq __eqtf2 at PLT
-; NOSSE-NEXT:    xorl %ecx, %ecx
-; NOSSE-NEXT:    testl %eax, %eax
-; NOSSE-NEXT:    movabsq $4611404543450677248, %rax # imm = 0x3FFF000000000000
-; NOSSE-NEXT:    cmovneq %rcx, %rax
+; NOSSE-NEXT:    movl %eax, %ecx
+; NOSSE-NEXT:    xorl %eax, %eax
+; NOSSE-NEXT:    testl %ecx, %ecx
+; NOSSE-NEXT:    movabsq $4611404543450677248, %rdx # imm = 0x3FFF000000000000
+; NOSSE-NEXT:    cmovneq %rax, %rdx
 ; NOSSE-NEXT:    testl %ebp, %ebp
-; NOSSE-NEXT:    jne .LBB1_2
-; NOSSE-NEXT:  # %bb.1: # %BB1
-; NOSSE-NEXT:    xorl %r14d, %r14d
-; NOSSE-NEXT:    movq %rax, %rbx
-; NOSSE-NEXT:  .LBB1_2: # %BB2
+; NOSSE-NEXT:    je .LBB1_2
+; NOSSE-NEXT:  # %bb.1:
 ; NOSSE-NEXT:    movq %r14, %rax
 ; NOSSE-NEXT:    movq %rbx, %rdx
+; NOSSE-NEXT:  .LBB1_2: # %BB2
 ; NOSSE-NEXT:    popq %rbx
 ; NOSSE-NEXT:    .cfi_def_cfa_offset 40
 ; NOSSE-NEXT:    popq %r12

diff  --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index 6ca9e77b9a555..bdb7c307a5759 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -196,10 +196,10 @@ define i32 @_Z10test_shortPsS_i_512(ptr nocapture readonly, ptr nocapture readon
 ; SSE2-LABEL: _Z10test_shortPsS_i_512:
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    movl %edx, %eax
-; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    xorl %ecx, %ecx
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    .p2align 4
 ; SSE2-NEXT:  .LBB2_1: # %vector.body
 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -210,19 +210,19 @@ define i32 @_Z10test_shortPsS_i_512(ptr nocapture readonly, ptr nocapture readon
 ; SSE2-NEXT:    paddd %xmm5, %xmm2
 ; SSE2-NEXT:    movdqu 16(%rsi,%rcx,2), %xmm3
 ; SSE2-NEXT:    pmaddwd %xmm4, %xmm3
-; SSE2-NEXT:    paddd %xmm3, %xmm0
+; SSE2-NEXT:    paddd %xmm3, %xmm1
 ; SSE2-NEXT:    addq $16, %rcx
 ; SSE2-NEXT:    cmpq %rcx, %rax
 ; SSE2-NEXT:    jne .LBB2_1
 ; SSE2-NEXT:  # %bb.2: # %middle.block
-; SSE2-NEXT:    paddd %xmm1, %xmm2
-; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    paddd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT:    paddd %xmm0, %xmm2
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE2-NEXT:    paddd %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: _Z10test_shortPsS_i_512:
@@ -397,9 +397,9 @@ define i32 @_Z10test_shortPsS_i_1024(ptr nocapture readonly, ptr nocapture reado
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    movl %edx, %eax
 ; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    xorl %ecx, %ecx
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    .p2align 4
 ; AVX1-NEXT:  .LBB3_1: # %vector.body
 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -445,8 +445,8 @@ define i32 @_Z10test_shortPsS_i_1024(ptr nocapture readonly, ptr nocapture reado
 ; AVX2:       # %bb.0: # %entry
 ; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    xorl %ecx, %ecx
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    .p2align 4
 ; AVX2-NEXT:  .LBB3_1: # %vector.body
@@ -454,16 +454,16 @@ define i32 @_Z10test_shortPsS_i_1024(ptr nocapture readonly, ptr nocapture reado
 ; AVX2-NEXT:    vmovdqu (%rsi,%rcx,2), %ymm3
 ; AVX2-NEXT:    vmovdqu 32(%rsi,%rcx,2), %ymm4
 ; AVX2-NEXT:    vpmaddwd (%rdi,%rcx,2), %ymm3, %ymm3
-; AVX2-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm3
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm3
+; AVX2-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    addq $16, %rcx
 ; AVX2-NEXT:    cmpq %rcx, %rax
 ; AVX2-NEXT:    jne .LBB3_1
 ; AVX2-NEXT:  # %bb.2: # %middle.block
-; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm2
-; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -770,10 +770,10 @@ define i32 @_Z9test_charPcS_i_512(ptr nocapture readonly, ptr nocapture readonly
 ; SSE2-LABEL: _Z9test_charPcS_i_512:
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    movl %edx, %eax
-; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    xorl %ecx, %ecx
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    .p2align 4
 ; SSE2-NEXT:  .LBB6_1: # %vector.body
 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -790,19 +790,19 @@ define i32 @_Z9test_charPcS_i_512(ptr nocapture readonly, ptr nocapture readonly
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE2-NEXT:    psraw $8, %xmm4
 ; SSE2-NEXT:    pmaddwd %xmm3, %xmm4
-; SSE2-NEXT:    paddd %xmm4, %xmm0
+; SSE2-NEXT:    paddd %xmm4, %xmm1
 ; SSE2-NEXT:    addq $16, %rcx
 ; SSE2-NEXT:    cmpq %rcx, %rax
 ; SSE2-NEXT:    jne .LBB6_1
 ; SSE2-NEXT:  # %bb.2: # %middle.block
-; SSE2-NEXT:    paddd %xmm1, %xmm2
-; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    paddd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT:    paddd %xmm0, %xmm2
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE2-NEXT:    paddd %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: _Z9test_charPcS_i_512:
@@ -993,9 +993,9 @@ define i32 @_Z9test_charPcS_i_1024(ptr nocapture readonly, ptr nocapture readonl
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    movl %edx, %eax
 ; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    xorl %ecx, %ecx
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    .p2align 4
 ; AVX1-NEXT:  .LBB7_1: # %vector.body
 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -1045,8 +1045,8 @@ define i32 @_Z9test_charPcS_i_1024(ptr nocapture readonly, ptr nocapture readonl
 ; AVX2:       # %bb.0: # %entry
 ; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    xorl %ecx, %ecx
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    .p2align 4
 ; AVX2-NEXT:  .LBB7_1: # %vector.body
@@ -1055,17 +1055,17 @@ define i32 @_Z9test_charPcS_i_1024(ptr nocapture readonly, ptr nocapture readonl
 ; AVX2-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm4
 ; AVX2-NEXT:    vpmovsxbw 16(%rsi,%rcx), %ymm5
 ; AVX2-NEXT:    vpmaddwd %ymm3, %ymm5, %ymm3
-; AVX2-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm3
 ; AVX2-NEXT:    vpmaddwd %ymm4, %ymm3, %ymm3
-; AVX2-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
 ; AVX2-NEXT:    addq $32, %rcx
 ; AVX2-NEXT:    cmpq %rcx, %rax
 ; AVX2-NEXT:    jne .LBB7_1
 ; AVX2-NEXT:  # %bb.2: # %middle.block
-; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm2
-; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -1271,19 +1271,19 @@ define i32 @test_unsigned_short_256(ptr nocapture readonly, ptr nocapture readon
 ; SSE2-NEXT:    pmullw %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa %xmm3, %xmm2
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT:    paddd %xmm2, %xmm1
+; SSE2-NEXT:    paddd %xmm2, %xmm0
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT:    paddd %xmm3, %xmm0
+; SSE2-NEXT:    paddd %xmm3, %xmm1
 ; SSE2-NEXT:    addq $16, %rcx
 ; SSE2-NEXT:    cmpq %rcx, %rax
 ; SSE2-NEXT:    jne .LBB9_1
 ; SSE2-NEXT:  # %bb.2: # %middle.block
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: test_unsigned_short_256:
@@ -1442,22 +1442,22 @@ define i32 @test_unsigned_short_512(ptr nocapture readonly, ptr nocapture readon
 ; AVX1-NEXT:    vpmulld %xmm4, %xmm6, %xmm4
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
 ; AVX1-NEXT:    vpmulld %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
 ; AVX1-NEXT:    vpaddd %xmm6, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpaddd %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpaddd %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    addq $16, %rcx
 ; AVX1-NEXT:    cmpq %rcx, %rax
 ; AVX1-NEXT:    jne .LBB10_1
 ; AVX1-NEXT:  # %bb.2: # %middle.block
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
 ; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
@@ -1480,15 +1480,15 @@ define i32 @test_unsigned_short_512(ptr nocapture readonly, ptr nocapture readon
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX2-NEXT:    vpmulld %ymm2, %ymm4, %ymm2
-; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX2-NEXT:    vpmulld %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    addq $16, %rcx
 ; AVX2-NEXT:    cmpq %rcx, %rax
 ; AVX2-NEXT:    jne .LBB10_1
 ; AVX2-NEXT:  # %bb.2: # %middle.block
-; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -1765,15 +1765,15 @@ define i32 @test_unsigned_short_1024(ptr nocapture readonly, ptr nocapture reado
 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512-NEXT:    vpmulld %zmm2, %zmm4, %zmm2
-; AVX512-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512-NEXT:    vpmulld %zmm3, %zmm2, %zmm2
-; AVX512-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
+; AVX512-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
 ; AVX512-NEXT:    addq $16, %rcx
 ; AVX512-NEXT:    cmpq %rcx, %rax
 ; AVX512-NEXT:    jne .LBB11_1
 ; AVX512-NEXT:  # %bb.2: # %middle.block
-; AVX512-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -2739,9 +2739,9 @@ define i64 @sum_and_sum_of_squares(ptr %a, i32 %n) {
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    movl %esi, %eax
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NEXT:    .p2align 4
 ; SSE2-NEXT:  .LBB33_1: # %vector.body
 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -3101,10 +3101,10 @@ define i32 @add_used_by_loop_phi(ptr %a, ptr %b, i64 %offset_a, i64 %offset_b, i
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    addq %rdx, %rdi
 ; SSE2-NEXT:    addq %rcx, %rsi
-; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    xorl %eax, %eax
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    .p2align 4
 ; SSE2-NEXT:  .LBB38_1: # %loop
 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -3121,19 +3121,19 @@ define i32 @add_used_by_loop_phi(ptr %a, ptr %b, i64 %offset_a, i64 %offset_b, i
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE2-NEXT:    psraw $8, %xmm3
 ; SSE2-NEXT:    pmaddwd %xmm4, %xmm3
-; SSE2-NEXT:    paddd %xmm3, %xmm0
+; SSE2-NEXT:    paddd %xmm3, %xmm1
 ; SSE2-NEXT:    addq $16, %rax
 ; SSE2-NEXT:    cmpq %r8, %rax
 ; SSE2-NEXT:    jb .LBB38_1
 ; SSE2-NEXT:  # %bb.2: # %afterloop
-; SSE2-NEXT:    paddd %xmm1, %xmm2
-; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    paddd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT:    paddd %xmm0, %xmm2
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE2-NEXT:    paddd %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: add_used_by_loop_phi:

diff  --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index 76a94f825266a..8c4bab99a5b7b 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -226,11 +226,12 @@ define <4 x double> @load_v4f64_v4i32(<4 x i32> %trigger, ptr %addr, <4 x double
 define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, ptr %addr) {
 ; SSE-LABEL: load_v4f64_v4i32_zero:
 ; SSE:       ## %bb.0:
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE-NEXT:    movmskps %xmm0, %eax
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    pxor %xmm0, %xmm0
+; SSE-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE-NEXT:    movmskps %xmm1, %eax
 ; SSE-NEXT:    testb $1, %al
-; SSE-NEXT:    xorps %xmm0, %xmm0
+; SSE-NEXT:    xorps %xmm1, %xmm1
 ; SSE-NEXT:    jne LBB3_1
 ; SSE-NEXT:  ## %bb.2: ## %else
 ; SSE-NEXT:    testb $2, %al
@@ -244,7 +245,7 @@ define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, ptr %addr) {
 ; SSE-NEXT:  LBB3_8: ## %else8
 ; SSE-NEXT:    retq
 ; SSE-NEXT:  LBB3_1: ## %cond.load
-; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; SSE-NEXT:    testb $2, %al
 ; SSE-NEXT:    je LBB3_4
 ; SSE-NEXT:  LBB3_3: ## %cond.load1
@@ -1095,9 +1096,9 @@ define <8 x float> @load_v8f32_v8i1_zero(<8 x i1> %mask, ptr %addr) {
 ; SSE2-NEXT:    psllw $15, %xmm0
 ; SSE2-NEXT:    packsswb %xmm0, %xmm0
 ; SSE2-NEXT:    pmovmskb %xmm0, %eax
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    xorps %xmm1, %xmm1
 ; SSE2-NEXT:    jne LBB10_1
 ; SSE2-NEXT:  ## %bb.2: ## %else
 ; SSE2-NEXT:    testb $2, %al
@@ -1174,9 +1175,9 @@ define <8 x float> @load_v8f32_v8i1_zero(<8 x i1> %mask, ptr %addr) {
 ; SSE42-NEXT:    psllw $15, %xmm0
 ; SSE42-NEXT:    packsswb %xmm0, %xmm0
 ; SSE42-NEXT:    pmovmskb %xmm0, %eax
-; SSE42-NEXT:    xorps %xmm1, %xmm1
-; SSE42-NEXT:    testb $1, %al
 ; SSE42-NEXT:    pxor %xmm0, %xmm0
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    xorps %xmm1, %xmm1
 ; SSE42-NEXT:    jne LBB10_1
 ; SSE42-NEXT:  ## %bb.2: ## %else
 ; SSE42-NEXT:    testb $2, %al
@@ -2613,9 +2614,9 @@ define <8 x i32> @load_v8i32_v8i1_zero(<8 x i1> %mask, ptr %addr) {
 ; SSE2-NEXT:    psllw $15, %xmm0
 ; SSE2-NEXT:    packsswb %xmm0, %xmm0
 ; SSE2-NEXT:    pmovmskb %xmm0, %eax
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    testb $1, %al
+; SSE2-NEXT:    xorps %xmm1, %xmm1
 ; SSE2-NEXT:    jne LBB20_1
 ; SSE2-NEXT:  ## %bb.2: ## %else
 ; SSE2-NEXT:    testb $2, %al
@@ -2692,9 +2693,9 @@ define <8 x i32> @load_v8i32_v8i1_zero(<8 x i1> %mask, ptr %addr) {
 ; SSE42-NEXT:    psllw $15, %xmm0
 ; SSE42-NEXT:    packsswb %xmm0, %xmm0
 ; SSE42-NEXT:    pmovmskb %xmm0, %eax
-; SSE42-NEXT:    pxor %xmm1, %xmm1
-; SSE42-NEXT:    testb $1, %al
 ; SSE42-NEXT:    pxor %xmm0, %xmm0
+; SSE42-NEXT:    testb $1, %al
+; SSE42-NEXT:    pxor %xmm1, %xmm1
 ; SSE42-NEXT:    jne LBB20_1
 ; SSE42-NEXT:  ## %bb.2: ## %else
 ; SSE42-NEXT:    testb $2, %al

diff  --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index 0ebe7d4ea588d..d752659f94a50 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -174,8 +174,8 @@ define dso_local i32 @_Z9test_charPcS_i_256(ptr nocapture readonly, ptr nocaptur
 ; CHECK-SKX:       # %bb.0: # %entry
 ; CHECK-SKX-NEXT:    movl %edx, %eax
 ; CHECK-SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; CHECK-SKX-NEXT:    xorl %ecx, %ecx
+; CHECK-SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; CHECK-SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; CHECK-SKX-NEXT:    .p2align 4
 ; CHECK-SKX-NEXT:  .LBB8_1: # %vector.body
@@ -184,17 +184,17 @@ define dso_local i32 @_Z9test_charPcS_i_256(ptr nocapture readonly, ptr nocaptur
 ; CHECK-SKX-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm4
 ; CHECK-SKX-NEXT:    vpmovsxbw 16(%rsi,%rcx), %ymm5
 ; CHECK-SKX-NEXT:    vpmaddwd %ymm3, %ymm5, %ymm3
-; CHECK-SKX-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
+; CHECK-SKX-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
 ; CHECK-SKX-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm3
 ; CHECK-SKX-NEXT:    vpmaddwd %ymm4, %ymm3, %ymm3
-; CHECK-SKX-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
+; CHECK-SKX-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
 ; CHECK-SKX-NEXT:    addq $32, %rcx
 ; CHECK-SKX-NEXT:    cmpq %rcx, %rax
 ; CHECK-SKX-NEXT:    jne .LBB8_1
 ; CHECK-SKX-NEXT:  # %bb.2: # %middle.block
-; CHECK-SKX-NEXT:    vpaddd %ymm0, %ymm2, %ymm2
-; CHECK-SKX-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-SKX-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
 ; CHECK-SKX-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
+; CHECK-SKX-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
 ; CHECK-SKX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; CHECK-SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; CHECK-SKX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -209,8 +209,8 @@ define dso_local i32 @_Z9test_charPcS_i_256(ptr nocapture readonly, ptr nocaptur
 ; CHECK-AVX512:       # %bb.0: # %entry
 ; CHECK-AVX512-NEXT:    movl %edx, %eax
 ; CHECK-AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; CHECK-AVX512-NEXT:    xorl %ecx, %ecx
+; CHECK-AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; CHECK-AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; CHECK-AVX512-NEXT:    .p2align 4
 ; CHECK-AVX512-NEXT:  .LBB8_1: # %vector.body
@@ -219,17 +219,17 @@ define dso_local i32 @_Z9test_charPcS_i_256(ptr nocapture readonly, ptr nocaptur
 ; CHECK-AVX512-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm4
 ; CHECK-AVX512-NEXT:    vpmovsxbw 16(%rsi,%rcx), %ymm5
 ; CHECK-AVX512-NEXT:    vpmaddwd %ymm3, %ymm5, %ymm3
-; CHECK-AVX512-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
+; CHECK-AVX512-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
 ; CHECK-AVX512-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm3
 ; CHECK-AVX512-NEXT:    vpmaddwd %ymm4, %ymm3, %ymm3
-; CHECK-AVX512-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
+; CHECK-AVX512-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
 ; CHECK-AVX512-NEXT:    addq $32, %rcx
 ; CHECK-AVX512-NEXT:    cmpq %rcx, %rax
 ; CHECK-AVX512-NEXT:    jne .LBB8_1
 ; CHECK-AVX512-NEXT:  # %bb.2: # %middle.block
-; CHECK-AVX512-NEXT:    vpaddd %ymm0, %ymm2, %ymm2
-; CHECK-AVX512-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-AVX512-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
 ; CHECK-AVX512-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
+; CHECK-AVX512-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
 ; CHECK-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; CHECK-AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -244,8 +244,8 @@ define dso_local i32 @_Z9test_charPcS_i_256(ptr nocapture readonly, ptr nocaptur
 ; CHECK-VBMI:       # %bb.0: # %entry
 ; CHECK-VBMI-NEXT:    movl %edx, %eax
 ; CHECK-VBMI-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-VBMI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; CHECK-VBMI-NEXT:    xorl %ecx, %ecx
+; CHECK-VBMI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; CHECK-VBMI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; CHECK-VBMI-NEXT:    .p2align 4
 ; CHECK-VBMI-NEXT:  .LBB8_1: # %vector.body
@@ -254,17 +254,17 @@ define dso_local i32 @_Z9test_charPcS_i_256(ptr nocapture readonly, ptr nocaptur
 ; CHECK-VBMI-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm4
 ; CHECK-VBMI-NEXT:    vpmovsxbw 16(%rsi,%rcx), %ymm5
 ; CHECK-VBMI-NEXT:    vpmaddwd %ymm3, %ymm5, %ymm3
-; CHECK-VBMI-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
+; CHECK-VBMI-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
 ; CHECK-VBMI-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm3
 ; CHECK-VBMI-NEXT:    vpmaddwd %ymm4, %ymm3, %ymm3
-; CHECK-VBMI-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
+; CHECK-VBMI-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
 ; CHECK-VBMI-NEXT:    addq $32, %rcx
 ; CHECK-VBMI-NEXT:    cmpq %rcx, %rax
 ; CHECK-VBMI-NEXT:    jne .LBB8_1
 ; CHECK-VBMI-NEXT:  # %bb.2: # %middle.block
-; CHECK-VBMI-NEXT:    vpaddd %ymm0, %ymm2, %ymm2
-; CHECK-VBMI-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-VBMI-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
 ; CHECK-VBMI-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
+; CHECK-VBMI-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
 ; CHECK-VBMI-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; CHECK-VBMI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; CHECK-VBMI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]

diff  --git a/llvm/test/CodeGen/X86/pcsections-atomics.ll b/llvm/test/CodeGen/X86/pcsections-atomics.ll
index c84bafa612948..672ebc1ec7275 100644
--- a/llvm/test/CodeGen/X86/pcsections-atomics.ll
+++ b/llvm/test/CodeGen/X86/pcsections-atomics.ll
@@ -689,12 +689,11 @@ define void @atomic8_nand_monotonic(ptr %a) {
 ; O0-NEXT:    orb $-43, %dl
 ; O0-NEXT:  .Lpcsection19:
 ; O0-NEXT:    lock cmpxchgb %dl, (%rcx)
-; O0-NEXT:    movb %al, %cl
 ; O0-NEXT:  .Lpcsection20:
-; O0-NEXT:    sete %al
-; O0-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; O0-NEXT:    sete %cl
 ; O0-NEXT:  .Lpcsection21:
-; O0-NEXT:    testb $1, %al
+; O0-NEXT:    testb $1, %cl
+; O0-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; O0-NEXT:  .Lpcsection22:
 ; O0-NEXT:    jne .LBB16_2
 ; O0-NEXT:    jmp .LBB16_1
@@ -1028,12 +1027,11 @@ define void @atomic8_nand_acquire(ptr %a) {
 ; O0-NEXT:    orb $-43, %dl
 ; O0-NEXT:  .Lpcsection32:
 ; O0-NEXT:    lock cmpxchgb %dl, (%rcx)
-; O0-NEXT:    movb %al, %cl
 ; O0-NEXT:  .Lpcsection33:
-; O0-NEXT:    sete %al
-; O0-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; O0-NEXT:    sete %cl
 ; O0-NEXT:  .Lpcsection34:
-; O0-NEXT:    testb $1, %al
+; O0-NEXT:    testb $1, %cl
+; O0-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; O0-NEXT:  .Lpcsection35:
 ; O0-NEXT:    jne .LBB23_2
 ; O0-NEXT:    jmp .LBB23_1
@@ -1367,12 +1365,11 @@ define void @atomic8_nand_release(ptr %a) {
 ; O0-NEXT:    orb $-43, %dl
 ; O0-NEXT:  .Lpcsection45:
 ; O0-NEXT:    lock cmpxchgb %dl, (%rcx)
-; O0-NEXT:    movb %al, %cl
 ; O0-NEXT:  .Lpcsection46:
-; O0-NEXT:    sete %al
-; O0-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; O0-NEXT:    sete %cl
 ; O0-NEXT:  .Lpcsection47:
-; O0-NEXT:    testb $1, %al
+; O0-NEXT:    testb $1, %cl
+; O0-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; O0-NEXT:  .Lpcsection48:
 ; O0-NEXT:    jne .LBB30_2
 ; O0-NEXT:    jmp .LBB30_1
@@ -1706,12 +1703,11 @@ define void @atomic8_nand_acq_rel(ptr %a) {
 ; O0-NEXT:    orb $-43, %dl
 ; O0-NEXT:  .Lpcsection58:
 ; O0-NEXT:    lock cmpxchgb %dl, (%rcx)
-; O0-NEXT:    movb %al, %cl
 ; O0-NEXT:  .Lpcsection59:
-; O0-NEXT:    sete %al
-; O0-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; O0-NEXT:    sete %cl
 ; O0-NEXT:  .Lpcsection60:
-; O0-NEXT:    testb $1, %al
+; O0-NEXT:    testb $1, %cl
+; O0-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; O0-NEXT:  .Lpcsection61:
 ; O0-NEXT:    jne .LBB37_2
 ; O0-NEXT:    jmp .LBB37_1
@@ -2045,12 +2041,11 @@ define void @atomic8_nand_seq_cst(ptr %a) {
 ; O0-NEXT:    orb $-43, %dl
 ; O0-NEXT:  .Lpcsection71:
 ; O0-NEXT:    lock cmpxchgb %dl, (%rcx)
-; O0-NEXT:    movb %al, %cl
 ; O0-NEXT:  .Lpcsection72:
-; O0-NEXT:    sete %al
-; O0-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; O0-NEXT:    sete %cl
 ; O0-NEXT:  .Lpcsection73:
-; O0-NEXT:    testb $1, %al
+; O0-NEXT:    testb $1, %cl
+; O0-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; O0-NEXT:  .Lpcsection74:
 ; O0-NEXT:    jne .LBB44_2
 ; O0-NEXT:    jmp .LBB44_1
@@ -3139,12 +3134,11 @@ define void @atomic16_nand_monotonic(ptr %a) {
 ; O0-NEXT:    # kill: def $dx killed $dx killed $edx
 ; O0-NEXT:  .Lpcsection119:
 ; O0-NEXT:    lock cmpxchgw %dx, (%rcx)
-; O0-NEXT:    movw %ax, %cx
 ; O0-NEXT:  .Lpcsection120:
-; O0-NEXT:    sete %al
-; O0-NEXT:    movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; O0-NEXT:    sete %cl
 ; O0-NEXT:  .Lpcsection121:
-; O0-NEXT:    testb $1, %al
+; O0-NEXT:    testb $1, %cl
+; O0-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; O0-NEXT:  .Lpcsection122:
 ; O0-NEXT:    jne .LBB64_2
 ; O0-NEXT:    jmp .LBB64_1
@@ -3494,12 +3488,11 @@ define void @atomic16_nand_acquire(ptr %a) {
 ; O0-NEXT:    # kill: def $dx killed $dx killed $edx
 ; O0-NEXT:  .Lpcsection134:
 ; O0-NEXT:    lock cmpxchgw %dx, (%rcx)
-; O0-NEXT:    movw %ax, %cx
 ; O0-NEXT:  .Lpcsection135:
-; O0-NEXT:    sete %al
-; O0-NEXT:    movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; O0-NEXT:    sete %cl
 ; O0-NEXT:  .Lpcsection136:
-; O0-NEXT:    testb $1, %al
+; O0-NEXT:    testb $1, %cl
+; O0-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; O0-NEXT:  .Lpcsection137:
 ; O0-NEXT:    jne .LBB71_2
 ; O0-NEXT:    jmp .LBB71_1
@@ -3849,12 +3842,11 @@ define void @atomic16_nand_release(ptr %a) {
 ; O0-NEXT:    # kill: def $dx killed $dx killed $edx
 ; O0-NEXT:  .Lpcsection149:
 ; O0-NEXT:    lock cmpxchgw %dx, (%rcx)
-; O0-NEXT:    movw %ax, %cx
 ; O0-NEXT:  .Lpcsection150:
-; O0-NEXT:    sete %al
-; O0-NEXT:    movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; O0-NEXT:    sete %cl
 ; O0-NEXT:  .Lpcsection151:
-; O0-NEXT:    testb $1, %al
+; O0-NEXT:    testb $1, %cl
+; O0-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; O0-NEXT:  .Lpcsection152:
 ; O0-NEXT:    jne .LBB78_2
 ; O0-NEXT:    jmp .LBB78_1
@@ -4204,12 +4196,11 @@ define void @atomic16_nand_acq_rel(ptr %a) {
 ; O0-NEXT:    # kill: def $dx killed $dx killed $edx
 ; O0-NEXT:  .Lpcsection164:
 ; O0-NEXT:    lock cmpxchgw %dx, (%rcx)
-; O0-NEXT:    movw %ax, %cx
 ; O0-NEXT:  .Lpcsection165:
-; O0-NEXT:    sete %al
-; O0-NEXT:    movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; O0-NEXT:    sete %cl
 ; O0-NEXT:  .Lpcsection166:
-; O0-NEXT:    testb $1, %al
+; O0-NEXT:    testb $1, %cl
+; O0-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; O0-NEXT:  .Lpcsection167:
 ; O0-NEXT:    jne .LBB85_2
 ; O0-NEXT:    jmp .LBB85_1
@@ -4559,12 +4550,11 @@ define void @atomic16_nand_seq_cst(ptr %a) {
 ; O0-NEXT:    # kill: def $dx killed $dx killed $edx
 ; O0-NEXT:  .Lpcsection179:
 ; O0-NEXT:    lock cmpxchgw %dx, (%rcx)
-; O0-NEXT:    movw %ax, %cx
 ; O0-NEXT:  .Lpcsection180:
-; O0-NEXT:    sete %al
-; O0-NEXT:    movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; O0-NEXT:    sete %cl
 ; O0-NEXT:  .Lpcsection181:
-; O0-NEXT:    testb $1, %al
+; O0-NEXT:    testb $1, %cl
+; O0-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; O0-NEXT:  .Lpcsection182:
 ; O0-NEXT:    jne .LBB92_2
 ; O0-NEXT:    jmp .LBB92_1
@@ -5616,12 +5606,11 @@ define void @atomic32_nand_monotonic(ptr %a) {
 ; O0-NEXT:    orl $-43, %edx
 ; O0-NEXT:  .Lpcsection225:
 ; O0-NEXT:    lock cmpxchgl %edx, (%rcx)
-; O0-NEXT:    movl %eax, %ecx
 ; O0-NEXT:  .Lpcsection226:
-; O0-NEXT:    sete %al
-; O0-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0-NEXT:    sete %cl
 ; O0-NEXT:  .Lpcsection227:
-; O0-NEXT:    testb $1, %al
+; O0-NEXT:    testb $1, %cl
+; O0-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; O0-NEXT:  .Lpcsection228:
 ; O0-NEXT:    jne .LBB112_2
 ; O0-NEXT:    jmp .LBB112_1
@@ -5955,12 +5944,11 @@ define void @atomic32_nand_acquire(ptr %a) {
 ; O0-NEXT:    orl $-43, %edx
 ; O0-NEXT:  .Lpcsection238:
 ; O0-NEXT:    lock cmpxchgl %edx, (%rcx)
-; O0-NEXT:    movl %eax, %ecx
 ; O0-NEXT:  .Lpcsection239:
-; O0-NEXT:    sete %al
-; O0-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0-NEXT:    sete %cl
 ; O0-NEXT:  .Lpcsection240:
-; O0-NEXT:    testb $1, %al
+; O0-NEXT:    testb $1, %cl
+; O0-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; O0-NEXT:  .Lpcsection241:
 ; O0-NEXT:    jne .LBB119_2
 ; O0-NEXT:    jmp .LBB119_1
@@ -6294,12 +6282,11 @@ define void @atomic32_nand_release(ptr %a) {
 ; O0-NEXT:    orl $-43, %edx
 ; O0-NEXT:  .Lpcsection251:
 ; O0-NEXT:    lock cmpxchgl %edx, (%rcx)
-; O0-NEXT:    movl %eax, %ecx
 ; O0-NEXT:  .Lpcsection252:
-; O0-NEXT:    sete %al
-; O0-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0-NEXT:    sete %cl
 ; O0-NEXT:  .Lpcsection253:
-; O0-NEXT:    testb $1, %al
+; O0-NEXT:    testb $1, %cl
+; O0-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; O0-NEXT:  .Lpcsection254:
 ; O0-NEXT:    jne .LBB126_2
 ; O0-NEXT:    jmp .LBB126_1
@@ -6633,12 +6620,11 @@ define void @atomic32_nand_acq_rel(ptr %a) {
 ; O0-NEXT:    orl $-43, %edx
 ; O0-NEXT:  .Lpcsection264:
 ; O0-NEXT:    lock cmpxchgl %edx, (%rcx)
-; O0-NEXT:    movl %eax, %ecx
 ; O0-NEXT:  .Lpcsection265:
-; O0-NEXT:    sete %al
-; O0-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0-NEXT:    sete %cl
 ; O0-NEXT:  .Lpcsection266:
-; O0-NEXT:    testb $1, %al
+; O0-NEXT:    testb $1, %cl
+; O0-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; O0-NEXT:  .Lpcsection267:
 ; O0-NEXT:    jne .LBB133_2
 ; O0-NEXT:    jmp .LBB133_1
@@ -6972,12 +6958,11 @@ define void @atomic32_nand_seq_cst(ptr %a) {
 ; O0-NEXT:    orl $-43, %edx
 ; O0-NEXT:  .Lpcsection277:
 ; O0-NEXT:    lock cmpxchgl %edx, (%rcx)
-; O0-NEXT:    movl %eax, %ecx
 ; O0-NEXT:  .Lpcsection278:
-; O0-NEXT:    sete %al
-; O0-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0-NEXT:    sete %cl
 ; O0-NEXT:  .Lpcsection279:
-; O0-NEXT:    testb $1, %al
+; O0-NEXT:    testb $1, %cl
+; O0-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; O0-NEXT:  .Lpcsection280:
 ; O0-NEXT:    jne .LBB140_2
 ; O0-NEXT:    jmp .LBB140_1
@@ -8143,12 +8128,11 @@ define void @atomic64_nand_monotonic(ptr %a) {
 ; O0-NEXT:    orq $-43, %rdx
 ; O0-NEXT:  .Lpcsection326:
 ; O0-NEXT:    lock cmpxchgq %rdx, (%rcx)
-; O0-NEXT:    movq %rax, %rcx
 ; O0-NEXT:  .Lpcsection327:
-; O0-NEXT:    sete %al
-; O0-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    sete %cl
 ; O0-NEXT:  .Lpcsection328:
-; O0-NEXT:    testb $1, %al
+; O0-NEXT:    testb $1, %cl
+; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection329:
 ; O0-NEXT:    jne .LBB162_2
 ; O0-NEXT:    jmp .LBB162_1
@@ -8485,12 +8469,11 @@ define void @atomic64_nand_acquire(ptr %a) {
 ; O0-NEXT:    orq $-43, %rdx
 ; O0-NEXT:  .Lpcsection340:
 ; O0-NEXT:    lock cmpxchgq %rdx, (%rcx)
-; O0-NEXT:    movq %rax, %rcx
 ; O0-NEXT:  .Lpcsection341:
-; O0-NEXT:    sete %al
-; O0-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    sete %cl
 ; O0-NEXT:  .Lpcsection342:
-; O0-NEXT:    testb $1, %al
+; O0-NEXT:    testb $1, %cl
+; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection343:
 ; O0-NEXT:    jne .LBB169_2
 ; O0-NEXT:    jmp .LBB169_1
@@ -8827,12 +8810,11 @@ define void @atomic64_nand_release(ptr %a) {
 ; O0-NEXT:    orq $-43, %rdx
 ; O0-NEXT:  .Lpcsection354:
 ; O0-NEXT:    lock cmpxchgq %rdx, (%rcx)
-; O0-NEXT:    movq %rax, %rcx
 ; O0-NEXT:  .Lpcsection355:
-; O0-NEXT:    sete %al
-; O0-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    sete %cl
 ; O0-NEXT:  .Lpcsection356:
-; O0-NEXT:    testb $1, %al
+; O0-NEXT:    testb $1, %cl
+; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection357:
 ; O0-NEXT:    jne .LBB176_2
 ; O0-NEXT:    jmp .LBB176_1
@@ -9169,12 +9151,11 @@ define void @atomic64_nand_acq_rel(ptr %a) {
 ; O0-NEXT:    orq $-43, %rdx
 ; O0-NEXT:  .Lpcsection368:
 ; O0-NEXT:    lock cmpxchgq %rdx, (%rcx)
-; O0-NEXT:    movq %rax, %rcx
 ; O0-NEXT:  .Lpcsection369:
-; O0-NEXT:    sete %al
-; O0-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    sete %cl
 ; O0-NEXT:  .Lpcsection370:
-; O0-NEXT:    testb $1, %al
+; O0-NEXT:    testb $1, %cl
+; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection371:
 ; O0-NEXT:    jne .LBB183_2
 ; O0-NEXT:    jmp .LBB183_1
@@ -9511,12 +9492,11 @@ define void @atomic64_nand_seq_cst(ptr %a) {
 ; O0-NEXT:    orq $-43, %rdx
 ; O0-NEXT:  .Lpcsection382:
 ; O0-NEXT:    lock cmpxchgq %rdx, (%rcx)
-; O0-NEXT:    movq %rax, %rcx
 ; O0-NEXT:  .Lpcsection383:
-; O0-NEXT:    sete %al
-; O0-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    sete %cl
 ; O0-NEXT:  .Lpcsection384:
-; O0-NEXT:    testb $1, %al
+; O0-NEXT:    testb $1, %cl
+; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection385:
 ; O0-NEXT:    jne .LBB190_2
 ; O0-NEXT:    jmp .LBB190_1
@@ -10547,8 +10527,8 @@ define void @atomic128_store_unordered(ptr %a) {
 ; O0-NEXT:    jmp .LBB203_1
 ; O0-NEXT:  .LBB203_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:  .Lpcsection426:
 ; O0-NEXT:    xorl %ecx, %ecx
@@ -10558,8 +10538,8 @@ define void @atomic128_store_unordered(ptr %a) {
 ; O0-NEXT:    movl $42, %ebx
 ; O0-NEXT:  .Lpcsection429:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection430:
 ; O0-NEXT:    jne .LBB203_1
 ; O0-NEXT:    jmp .LBB203_2
@@ -10674,8 +10654,8 @@ define void @atomic128_store_monotonic(ptr %a) {
 ; O0-NEXT:    jmp .LBB204_1
 ; O0-NEXT:  .LBB204_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:  .Lpcsection434:
 ; O0-NEXT:    xorl %ecx, %ecx
@@ -10685,8 +10665,8 @@ define void @atomic128_store_monotonic(ptr %a) {
 ; O0-NEXT:    movl $42, %ebx
 ; O0-NEXT:  .Lpcsection437:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection438:
 ; O0-NEXT:    jne .LBB204_1
 ; O0-NEXT:    jmp .LBB204_2
@@ -10801,8 +10781,8 @@ define void @atomic128_store_release(ptr %a) {
 ; O0-NEXT:    jmp .LBB205_1
 ; O0-NEXT:  .LBB205_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:  .Lpcsection442:
 ; O0-NEXT:    xorl %ecx, %ecx
@@ -10812,8 +10792,8 @@ define void @atomic128_store_release(ptr %a) {
 ; O0-NEXT:    movl $42, %ebx
 ; O0-NEXT:  .Lpcsection445:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection446:
 ; O0-NEXT:    jne .LBB205_1
 ; O0-NEXT:    jmp .LBB205_2
@@ -10928,8 +10908,8 @@ define void @atomic128_store_seq_cst(ptr %a) {
 ; O0-NEXT:    jmp .LBB206_1
 ; O0-NEXT:  .LBB206_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:  .Lpcsection450:
 ; O0-NEXT:    xorl %ecx, %ecx
@@ -10939,8 +10919,8 @@ define void @atomic128_store_seq_cst(ptr %a) {
 ; O0-NEXT:    movl $42, %ebx
 ; O0-NEXT:  .Lpcsection453:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection454:
 ; O0-NEXT:    jne .LBB206_1
 ; O0-NEXT:    jmp .LBB206_2
@@ -11094,8 +11074,8 @@ define void @atomic128_xchg_monotonic(ptr %a) {
 ; O0-NEXT:    jmp .LBB208_1
 ; O0-NEXT:  .LBB208_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:  .Lpcsection459:
 ; O0-NEXT:    xorl %ecx, %ecx
@@ -11105,8 +11085,8 @@ define void @atomic128_xchg_monotonic(ptr %a) {
 ; O0-NEXT:    movl $42, %ebx
 ; O0-NEXT:  .Lpcsection462:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection463:
 ; O0-NEXT:    jne .LBB208_1
 ; O0-NEXT:    jmp .LBB208_2
@@ -11221,8 +11201,8 @@ define void @atomic128_add_monotonic(ptr %a) {
 ; O0-NEXT:    jmp .LBB209_1
 ; O0-NEXT:  .LBB209_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movq %rax, %rbx
 ; O0-NEXT:  .Lpcsection467:
@@ -11232,8 +11212,8 @@ define void @atomic128_add_monotonic(ptr %a) {
 ; O0-NEXT:    adcq $0, %rcx
 ; O0-NEXT:  .Lpcsection469:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection470:
 ; O0-NEXT:    jne .LBB209_1
 ; O0-NEXT:    jmp .LBB209_2
@@ -11354,8 +11334,8 @@ define void @atomic128_sub_monotonic(ptr %a) {
 ; O0-NEXT:    jmp .LBB210_1
 ; O0-NEXT:  .LBB210_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movq %rax, %rbx
 ; O0-NEXT:  .Lpcsection474:
@@ -11365,8 +11345,8 @@ define void @atomic128_sub_monotonic(ptr %a) {
 ; O0-NEXT:    adcq $-1, %rcx
 ; O0-NEXT:  .Lpcsection476:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection477:
 ; O0-NEXT:    jne .LBB210_1
 ; O0-NEXT:    jmp .LBB210_2
@@ -11487,8 +11467,8 @@ define void @atomic128_and_monotonic(ptr %a) {
 ; O0-NEXT:    jmp .LBB211_1
 ; O0-NEXT:  .LBB211_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movl %eax, %ecx
 ; O0-NEXT:  .Lpcsection481:
@@ -11500,8 +11480,8 @@ define void @atomic128_and_monotonic(ptr %a) {
 ; O0-NEXT:    # kill: def $rcx killed $ecx
 ; O0-NEXT:  .Lpcsection484:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection485:
 ; O0-NEXT:    jne .LBB211_1
 ; O0-NEXT:    jmp .LBB211_2
@@ -11619,8 +11599,8 @@ define void @atomic128_or_monotonic(ptr %a) {
 ; O0-NEXT:    jmp .LBB212_1
 ; O0-NEXT:  .LBB212_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movq %rax, %rbx
 ; O0-NEXT:  .Lpcsection489:
@@ -11628,8 +11608,8 @@ define void @atomic128_or_monotonic(ptr %a) {
 ; O0-NEXT:    movq %rcx, %rdx
 ; O0-NEXT:  .Lpcsection490:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection491:
 ; O0-NEXT:    jne .LBB212_1
 ; O0-NEXT:    jmp .LBB212_2
@@ -11744,8 +11724,8 @@ define void @atomic128_xor_monotonic(ptr %a) {
 ; O0-NEXT:    jmp .LBB213_1
 ; O0-NEXT:  .LBB213_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movq %rax, %rbx
 ; O0-NEXT:  .Lpcsection495:
@@ -11753,8 +11733,8 @@ define void @atomic128_xor_monotonic(ptr %a) {
 ; O0-NEXT:    movq %rcx, %rdx
 ; O0-NEXT:  .Lpcsection496:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection497:
 ; O0-NEXT:    jne .LBB213_1
 ; O0-NEXT:    jmp .LBB213_2
@@ -11869,8 +11849,8 @@ define void @atomic128_nand_monotonic(ptr %a) {
 ; O0-NEXT:    jmp .LBB214_1
 ; O0-NEXT:  .LBB214_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movl %eax, %ecx
 ; O0-NEXT:  .Lpcsection501:
@@ -11884,8 +11864,8 @@ define void @atomic128_nand_monotonic(ptr %a) {
 ; O0-NEXT:    movq $-1, %rcx
 ; O0-NEXT:  .Lpcsection505:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection506:
 ; O0-NEXT:    jne .LBB214_1
 ; O0-NEXT:    jmp .LBB214_2
@@ -12009,8 +11989,8 @@ define void @atomic128_xchg_acquire(ptr %a) {
 ; O0-NEXT:    jmp .LBB215_1
 ; O0-NEXT:  .LBB215_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:  .Lpcsection510:
 ; O0-NEXT:    xorl %ecx, %ecx
@@ -12020,8 +12000,8 @@ define void @atomic128_xchg_acquire(ptr %a) {
 ; O0-NEXT:    movl $42, %ebx
 ; O0-NEXT:  .Lpcsection513:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection514:
 ; O0-NEXT:    jne .LBB215_1
 ; O0-NEXT:    jmp .LBB215_2
@@ -12136,8 +12116,8 @@ define void @atomic128_add_acquire(ptr %a) {
 ; O0-NEXT:    jmp .LBB216_1
 ; O0-NEXT:  .LBB216_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movq %rax, %rbx
 ; O0-NEXT:  .Lpcsection518:
@@ -12147,8 +12127,8 @@ define void @atomic128_add_acquire(ptr %a) {
 ; O0-NEXT:    adcq $0, %rcx
 ; O0-NEXT:  .Lpcsection520:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection521:
 ; O0-NEXT:    jne .LBB216_1
 ; O0-NEXT:    jmp .LBB216_2
@@ -12269,8 +12249,8 @@ define void @atomic128_sub_acquire(ptr %a) {
 ; O0-NEXT:    jmp .LBB217_1
 ; O0-NEXT:  .LBB217_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movq %rax, %rbx
 ; O0-NEXT:  .Lpcsection525:
@@ -12280,8 +12260,8 @@ define void @atomic128_sub_acquire(ptr %a) {
 ; O0-NEXT:    adcq $-1, %rcx
 ; O0-NEXT:  .Lpcsection527:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection528:
 ; O0-NEXT:    jne .LBB217_1
 ; O0-NEXT:    jmp .LBB217_2
@@ -12402,8 +12382,8 @@ define void @atomic128_and_acquire(ptr %a) {
 ; O0-NEXT:    jmp .LBB218_1
 ; O0-NEXT:  .LBB218_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movl %eax, %ecx
 ; O0-NEXT:  .Lpcsection532:
@@ -12415,8 +12395,8 @@ define void @atomic128_and_acquire(ptr %a) {
 ; O0-NEXT:    # kill: def $rcx killed $ecx
 ; O0-NEXT:  .Lpcsection535:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection536:
 ; O0-NEXT:    jne .LBB218_1
 ; O0-NEXT:    jmp .LBB218_2
@@ -12534,8 +12514,8 @@ define void @atomic128_or_acquire(ptr %a) {
 ; O0-NEXT:    jmp .LBB219_1
 ; O0-NEXT:  .LBB219_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movq %rax, %rbx
 ; O0-NEXT:  .Lpcsection540:
@@ -12543,8 +12523,8 @@ define void @atomic128_or_acquire(ptr %a) {
 ; O0-NEXT:    movq %rcx, %rdx
 ; O0-NEXT:  .Lpcsection541:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection542:
 ; O0-NEXT:    jne .LBB219_1
 ; O0-NEXT:    jmp .LBB219_2
@@ -12659,8 +12639,8 @@ define void @atomic128_xor_acquire(ptr %a) {
 ; O0-NEXT:    jmp .LBB220_1
 ; O0-NEXT:  .LBB220_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movq %rax, %rbx
 ; O0-NEXT:  .Lpcsection546:
@@ -12668,8 +12648,8 @@ define void @atomic128_xor_acquire(ptr %a) {
 ; O0-NEXT:    movq %rcx, %rdx
 ; O0-NEXT:  .Lpcsection547:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection548:
 ; O0-NEXT:    jne .LBB220_1
 ; O0-NEXT:    jmp .LBB220_2
@@ -12784,8 +12764,8 @@ define void @atomic128_nand_acquire(ptr %a) {
 ; O0-NEXT:    jmp .LBB221_1
 ; O0-NEXT:  .LBB221_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movl %eax, %ecx
 ; O0-NEXT:  .Lpcsection552:
@@ -12799,8 +12779,8 @@ define void @atomic128_nand_acquire(ptr %a) {
 ; O0-NEXT:    movq $-1, %rcx
 ; O0-NEXT:  .Lpcsection556:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection557:
 ; O0-NEXT:    jne .LBB221_1
 ; O0-NEXT:    jmp .LBB221_2
@@ -12924,8 +12904,8 @@ define void @atomic128_xchg_release(ptr %a) {
 ; O0-NEXT:    jmp .LBB222_1
 ; O0-NEXT:  .LBB222_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:  .Lpcsection561:
 ; O0-NEXT:    xorl %ecx, %ecx
@@ -12935,8 +12915,8 @@ define void @atomic128_xchg_release(ptr %a) {
 ; O0-NEXT:    movl $42, %ebx
 ; O0-NEXT:  .Lpcsection564:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection565:
 ; O0-NEXT:    jne .LBB222_1
 ; O0-NEXT:    jmp .LBB222_2
@@ -13050,8 +13030,8 @@ define void @atomic128_add_release(ptr %a) {
 ; O0-NEXT:    jmp .LBB223_1
 ; O0-NEXT:  .LBB223_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movq %rax, %rbx
 ; O0-NEXT:  .Lpcsection569:
@@ -13061,8 +13041,8 @@ define void @atomic128_add_release(ptr %a) {
 ; O0-NEXT:    adcq $0, %rcx
 ; O0-NEXT:  .Lpcsection571:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection572:
 ; O0-NEXT:    jne .LBB223_1
 ; O0-NEXT:    jmp .LBB223_2
@@ -13183,8 +13163,8 @@ define void @atomic128_sub_release(ptr %a) {
 ; O0-NEXT:    jmp .LBB224_1
 ; O0-NEXT:  .LBB224_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movq %rax, %rbx
 ; O0-NEXT:  .Lpcsection576:
@@ -13194,8 +13174,8 @@ define void @atomic128_sub_release(ptr %a) {
 ; O0-NEXT:    adcq $-1, %rcx
 ; O0-NEXT:  .Lpcsection578:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection579:
 ; O0-NEXT:    jne .LBB224_1
 ; O0-NEXT:    jmp .LBB224_2
@@ -13316,8 +13296,8 @@ define void @atomic128_and_release(ptr %a) {
 ; O0-NEXT:    jmp .LBB225_1
 ; O0-NEXT:  .LBB225_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movl %eax, %ecx
 ; O0-NEXT:  .Lpcsection583:
@@ -13329,8 +13309,8 @@ define void @atomic128_and_release(ptr %a) {
 ; O0-NEXT:    # kill: def $rcx killed $ecx
 ; O0-NEXT:  .Lpcsection586:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection587:
 ; O0-NEXT:    jne .LBB225_1
 ; O0-NEXT:    jmp .LBB225_2
@@ -13448,8 +13428,8 @@ define void @atomic128_or_release(ptr %a) {
 ; O0-NEXT:    jmp .LBB226_1
 ; O0-NEXT:  .LBB226_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movq %rax, %rbx
 ; O0-NEXT:  .Lpcsection591:
@@ -13457,8 +13437,8 @@ define void @atomic128_or_release(ptr %a) {
 ; O0-NEXT:    movq %rcx, %rdx
 ; O0-NEXT:  .Lpcsection592:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection593:
 ; O0-NEXT:    jne .LBB226_1
 ; O0-NEXT:    jmp .LBB226_2
@@ -13573,8 +13553,8 @@ define void @atomic128_xor_release(ptr %a) {
 ; O0-NEXT:    jmp .LBB227_1
 ; O0-NEXT:  .LBB227_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movq %rax, %rbx
 ; O0-NEXT:  .Lpcsection597:
@@ -13582,8 +13562,8 @@ define void @atomic128_xor_release(ptr %a) {
 ; O0-NEXT:    movq %rcx, %rdx
 ; O0-NEXT:  .Lpcsection598:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection599:
 ; O0-NEXT:    jne .LBB227_1
 ; O0-NEXT:    jmp .LBB227_2
@@ -13698,8 +13678,8 @@ define void @atomic128_nand_release(ptr %a) {
 ; O0-NEXT:    jmp .LBB228_1
 ; O0-NEXT:  .LBB228_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movl %eax, %ecx
 ; O0-NEXT:  .Lpcsection603:
@@ -13713,8 +13693,8 @@ define void @atomic128_nand_release(ptr %a) {
 ; O0-NEXT:    movq $-1, %rcx
 ; O0-NEXT:  .Lpcsection607:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection608:
 ; O0-NEXT:    jne .LBB228_1
 ; O0-NEXT:    jmp .LBB228_2
@@ -13838,8 +13818,8 @@ define void @atomic128_xchg_acq_rel(ptr %a) {
 ; O0-NEXT:    jmp .LBB229_1
 ; O0-NEXT:  .LBB229_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:  .Lpcsection612:
 ; O0-NEXT:    xorl %ecx, %ecx
@@ -13849,8 +13829,8 @@ define void @atomic128_xchg_acq_rel(ptr %a) {
 ; O0-NEXT:    movl $42, %ebx
 ; O0-NEXT:  .Lpcsection615:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection616:
 ; O0-NEXT:    jne .LBB229_1
 ; O0-NEXT:    jmp .LBB229_2
@@ -13965,8 +13945,8 @@ define void @atomic128_add_acq_rel(ptr %a) {
 ; O0-NEXT:    jmp .LBB230_1
 ; O0-NEXT:  .LBB230_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movq %rax, %rbx
 ; O0-NEXT:  .Lpcsection620:
@@ -13976,8 +13956,8 @@ define void @atomic128_add_acq_rel(ptr %a) {
 ; O0-NEXT:    adcq $0, %rcx
 ; O0-NEXT:  .Lpcsection622:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection623:
 ; O0-NEXT:    jne .LBB230_1
 ; O0-NEXT:    jmp .LBB230_2
@@ -14098,8 +14078,8 @@ define void @atomic128_sub_acq_rel(ptr %a) {
 ; O0-NEXT:    jmp .LBB231_1
 ; O0-NEXT:  .LBB231_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movq %rax, %rbx
 ; O0-NEXT:  .Lpcsection627:
@@ -14109,8 +14089,8 @@ define void @atomic128_sub_acq_rel(ptr %a) {
 ; O0-NEXT:    adcq $-1, %rcx
 ; O0-NEXT:  .Lpcsection629:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection630:
 ; O0-NEXT:    jne .LBB231_1
 ; O0-NEXT:    jmp .LBB231_2
@@ -14231,8 +14211,8 @@ define void @atomic128_and_acq_rel(ptr %a) {
 ; O0-NEXT:    jmp .LBB232_1
 ; O0-NEXT:  .LBB232_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movl %eax, %ecx
 ; O0-NEXT:  .Lpcsection634:
@@ -14244,8 +14224,8 @@ define void @atomic128_and_acq_rel(ptr %a) {
 ; O0-NEXT:    # kill: def $rcx killed $ecx
 ; O0-NEXT:  .Lpcsection637:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection638:
 ; O0-NEXT:    jne .LBB232_1
 ; O0-NEXT:    jmp .LBB232_2
@@ -14363,8 +14343,8 @@ define void @atomic128_or_acq_rel(ptr %a) {
 ; O0-NEXT:    jmp .LBB233_1
 ; O0-NEXT:  .LBB233_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movq %rax, %rbx
 ; O0-NEXT:  .Lpcsection642:
@@ -14372,8 +14352,8 @@ define void @atomic128_or_acq_rel(ptr %a) {
 ; O0-NEXT:    movq %rcx, %rdx
 ; O0-NEXT:  .Lpcsection643:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection644:
 ; O0-NEXT:    jne .LBB233_1
 ; O0-NEXT:    jmp .LBB233_2
@@ -14488,8 +14468,8 @@ define void @atomic128_xor_acq_rel(ptr %a) {
 ; O0-NEXT:    jmp .LBB234_1
 ; O0-NEXT:  .LBB234_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movq %rax, %rbx
 ; O0-NEXT:  .Lpcsection648:
@@ -14497,8 +14477,8 @@ define void @atomic128_xor_acq_rel(ptr %a) {
 ; O0-NEXT:    movq %rcx, %rdx
 ; O0-NEXT:  .Lpcsection649:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection650:
 ; O0-NEXT:    jne .LBB234_1
 ; O0-NEXT:    jmp .LBB234_2
@@ -14613,8 +14593,8 @@ define void @atomic128_nand_acq_rel(ptr %a) {
 ; O0-NEXT:    jmp .LBB235_1
 ; O0-NEXT:  .LBB235_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movl %eax, %ecx
 ; O0-NEXT:  .Lpcsection654:
@@ -14628,8 +14608,8 @@ define void @atomic128_nand_acq_rel(ptr %a) {
 ; O0-NEXT:    movq $-1, %rcx
 ; O0-NEXT:  .Lpcsection658:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection659:
 ; O0-NEXT:    jne .LBB235_1
 ; O0-NEXT:    jmp .LBB235_2
@@ -14753,8 +14733,8 @@ define void @atomic128_xchg_seq_cst(ptr %a) {
 ; O0-NEXT:    jmp .LBB236_1
 ; O0-NEXT:  .LBB236_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:  .Lpcsection663:
 ; O0-NEXT:    xorl %ecx, %ecx
@@ -14764,8 +14744,8 @@ define void @atomic128_xchg_seq_cst(ptr %a) {
 ; O0-NEXT:    movl $42, %ebx
 ; O0-NEXT:  .Lpcsection666:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection667:
 ; O0-NEXT:    jne .LBB236_1
 ; O0-NEXT:    jmp .LBB236_2
@@ -14880,8 +14860,8 @@ define void @atomic128_add_seq_cst(ptr %a) {
 ; O0-NEXT:    jmp .LBB237_1
 ; O0-NEXT:  .LBB237_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movq %rax, %rbx
 ; O0-NEXT:  .Lpcsection671:
@@ -14891,8 +14871,8 @@ define void @atomic128_add_seq_cst(ptr %a) {
 ; O0-NEXT:    adcq $0, %rcx
 ; O0-NEXT:  .Lpcsection673:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection674:
 ; O0-NEXT:    jne .LBB237_1
 ; O0-NEXT:    jmp .LBB237_2
@@ -15013,8 +14993,8 @@ define void @atomic128_sub_seq_cst(ptr %a) {
 ; O0-NEXT:    jmp .LBB238_1
 ; O0-NEXT:  .LBB238_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movq %rax, %rbx
 ; O0-NEXT:  .Lpcsection678:
@@ -15024,8 +15004,8 @@ define void @atomic128_sub_seq_cst(ptr %a) {
 ; O0-NEXT:    adcq $-1, %rcx
 ; O0-NEXT:  .Lpcsection680:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection681:
 ; O0-NEXT:    jne .LBB238_1
 ; O0-NEXT:    jmp .LBB238_2
@@ -15146,8 +15126,8 @@ define void @atomic128_and_seq_cst(ptr %a) {
 ; O0-NEXT:    jmp .LBB239_1
 ; O0-NEXT:  .LBB239_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movl %eax, %ecx
 ; O0-NEXT:  .Lpcsection685:
@@ -15159,8 +15139,8 @@ define void @atomic128_and_seq_cst(ptr %a) {
 ; O0-NEXT:    # kill: def $rcx killed $ecx
 ; O0-NEXT:  .Lpcsection688:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection689:
 ; O0-NEXT:    jne .LBB239_1
 ; O0-NEXT:    jmp .LBB239_2
@@ -15278,8 +15258,8 @@ define void @atomic128_or_seq_cst(ptr %a) {
 ; O0-NEXT:    jmp .LBB240_1
 ; O0-NEXT:  .LBB240_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movq %rax, %rbx
 ; O0-NEXT:  .Lpcsection693:
@@ -15287,8 +15267,8 @@ define void @atomic128_or_seq_cst(ptr %a) {
 ; O0-NEXT:    movq %rcx, %rdx
 ; O0-NEXT:  .Lpcsection694:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection695:
 ; O0-NEXT:    jne .LBB240_1
 ; O0-NEXT:    jmp .LBB240_2
@@ -15403,8 +15383,8 @@ define void @atomic128_xor_seq_cst(ptr %a) {
 ; O0-NEXT:    jmp .LBB241_1
 ; O0-NEXT:  .LBB241_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movq %rax, %rbx
 ; O0-NEXT:  .Lpcsection699:
@@ -15412,8 +15392,8 @@ define void @atomic128_xor_seq_cst(ptr %a) {
 ; O0-NEXT:    movq %rcx, %rdx
 ; O0-NEXT:  .Lpcsection700:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection701:
 ; O0-NEXT:    jne .LBB241_1
 ; O0-NEXT:    jmp .LBB241_2
@@ -15528,8 +15508,8 @@ define void @atomic128_nand_seq_cst(ptr %a) {
 ; O0-NEXT:    jmp .LBB242_1
 ; O0-NEXT:  .LBB242_1: # %atomicrmw.start
 ; O0-NEXT:    # =>This Inner Loop Header: Depth=1
-; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; O0-NEXT:    movl %eax, %ecx
 ; O0-NEXT:  .Lpcsection705:
@@ -15543,8 +15523,8 @@ define void @atomic128_nand_seq_cst(ptr %a) {
 ; O0-NEXT:    movq $-1, %rcx
 ; O0-NEXT:  .Lpcsection709:
 ; O0-NEXT:    lock cmpxchg16b (%rsi)
-; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; O0-NEXT:  .Lpcsection710:
 ; O0-NEXT:    jne .LBB242_1
 ; O0-NEXT:    jmp .LBB242_2

diff  --git a/llvm/test/CodeGen/X86/pr15705.ll b/llvm/test/CodeGen/X86/pr15705.ll
index bbbeb53764686..3dd4aabe25b57 100644
--- a/llvm/test/CodeGen/X86/pr15705.ll
+++ b/llvm/test/CodeGen/X86/pr15705.ll
@@ -5,19 +5,18 @@
 define i32 @PR15705(i32 %x, i32 %a, i32 %b, i32 %c) #0 {
 ; X86-LABEL: PR15705:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    je .LBB0_4
+; X86-NEXT:  # %bb.1: # %if.end
 ; X86-NEXT:    cmpl %eax, %edx
-; X86-NEXT:    jne .LBB0_2
-; X86-NEXT:  # %bb.1:
+; X86-NEXT:    jne .LBB0_3
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:  .LBB0_3: # %if.end
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB0_2: # %if.end
-; X86-NEXT:    cmpl %ecx, %edx
-; X86-NEXT:    jne .LBB0_4
-; X86-NEXT:  # %bb.3:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:  .LBB0_4: # %return
 ; X86-NEXT:    retl
 ;

diff  --git a/llvm/test/CodeGen/X86/pr32256.ll b/llvm/test/CodeGen/X86/pr32256.ll
index 09f7d92c1db2e..225a3af551a2c 100644
--- a/llvm/test/CodeGen/X86/pr32256.ll
+++ b/llvm/test/CodeGen/X86/pr32256.ll
@@ -9,12 +9,12 @@ define void @_Z1av() {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subl $2, %esp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 6
-; CHECK-NEXT:    movb c, %al
-; CHECK-NEXT:    xorb $-1, %al
-; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT:    movb %cl, (%esp) # 1-byte Spill
-; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    movb c, %cl
+; CHECK-NEXT:    xorb $-1, %cl
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    testb $1, %cl
+; CHECK-NEXT:    movb %al, (%esp) # 1-byte Spill
 ; CHECK-NEXT:    jne .LBB0_1
 ; CHECK-NEXT:    jmp .LBB0_2
 ; CHECK-NEXT:  .LBB0_1: # %land.rhs

diff  --git a/llvm/test/CodeGen/X86/pr38795.ll b/llvm/test/CodeGen/X86/pr38795.ll
index 02dbe4f545fe5..c3c96e8228797 100644
--- a/llvm/test/CodeGen/X86/pr38795.ll
+++ b/llvm/test/CodeGen/X86/pr38795.ll
@@ -33,7 +33,6 @@ define dso_local void @fn() {
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB0_15: # %for.inc
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    movl %esi, %ecx
 ; CHECK-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; CHECK-NEXT:  .LBB0_1: # %for.cond
 ; CHECK-NEXT:    # =>This Loop Header: Depth=1
@@ -70,6 +69,7 @@ define dso_local void @fn() {
 ; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %dh # 1-byte Reload
 ; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Reload
 ; CHECK-NEXT:    testb %bl, %bl
+; CHECK-NEXT:    movl %esi, %ecx
 ; CHECK-NEXT:    # implicit-def: $eax
 ; CHECK-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; CHECK-NEXT:    jne .LBB0_15
@@ -119,7 +119,7 @@ define dso_local void @fn() {
 ; CHECK-NEXT:    jne .LBB0_9
 ; CHECK-NEXT:  # %bb.12: # %if.end26
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    xorl %esi, %esi
+; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    testb %dh, %dh
 ; CHECK-NEXT:    je .LBB0_15
 ; CHECK-NEXT:  # %bb.13: # %if.end26
@@ -128,7 +128,7 @@ define dso_local void @fn() {
 ; CHECK-NEXT:    jne .LBB0_15
 ; CHECK-NEXT:  # %bb.14: # %if.then31
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    xorl %esi, %esi
+; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    xorl %ebp, %ebp
 ; CHECK-NEXT:    jmp .LBB0_15
 ; CHECK-NEXT:    .p2align 4
@@ -279,34 +279,31 @@ define void @verifier_error_reduced_issue38788(i1 %cmp11) {
 ; CHECK-NEXT:    je .LBB1_3
 ; CHECK-NEXT:  # %bb.2: # in Loop: Header=BB1_1 Depth=1
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    movl %ebx, %edx
 ; CHECK-NEXT:    jmp .LBB1_5
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB1_3: # %if.end
 ; CHECK-NEXT:    # in Loop: Header=BB1_1 Depth=1
-; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    testb $1, %al
 ; CHECK-NEXT:    je .LBB1_4
 ; CHECK-NEXT:  # %bb.9: # %if.then13
 ; CHECK-NEXT:    # in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    testb $1, %al
 ; CHECK-NEXT:    movl %ebx, %eax
 ; CHECK-NEXT:    movl $0, %ebx
 ; CHECK-NEXT:    jne .LBB1_8
-; CHECK-NEXT:  # %bb.10: # %for.cond35
-; CHECK-NEXT:    # in Loop: Header=BB1_1 Depth=1
-; CHECK-NEXT:    movl %ebx, %edx
 ; CHECK-NEXT:    jmp .LBB1_5
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB1_4: # in Loop: Header=BB1_1 Depth=1
 ; CHECK-NEXT:    movl %ebx, %eax
+; CHECK-NEXT:    xorl %ebx, %ebx
 ; CHECK-NEXT:  .LBB1_5: # %if.end26
 ; CHECK-NEXT:    # in Loop: Header=BB1_1 Depth=1
 ; CHECK-NEXT:    testb %cl, %cl
 ; CHECK-NEXT:    je .LBB1_7
 ; CHECK-NEXT:  # %bb.6: # %if.end26
 ; CHECK-NEXT:    # in Loop: Header=BB1_1 Depth=1
-; CHECK-NEXT:    movl %edx, %ecx
+; CHECK-NEXT:    movl %ebx, %ecx
 ; CHECK-NEXT:    jmp .LBB1_7
 entry:
   br label %for.cond

diff  --git a/llvm/test/CodeGen/X86/pr49451.ll b/llvm/test/CodeGen/X86/pr49451.ll
index 1a7551f6117e8..173c41140ebef 100644
--- a/llvm/test/CodeGen/X86/pr49451.ll
+++ b/llvm/test/CodeGen/X86/pr49451.ll
@@ -18,15 +18,15 @@ define void @func_6(i8 %uc_8, i64 %uli_10) nounwind {
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB0_1: # %for.body612
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    testb %bl, %bl
+; X86-NEXT:    testb %dl, %dl
 ; X86-NEXT:    je .LBB0_2
 ; X86-NEXT:  # %bb.3: # %if.end1401
 ; X86-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; X86-NEXT:    addl %eax, %esi
 ; X86-NEXT:    movw %si, s_2
-; X86-NEXT:    movw %dx, s_0
+; X86-NEXT:    movw %bx, s_0
 ; X86-NEXT:    incl %ecx
-; X86-NEXT:    incl %edx
+; X86-NEXT:    incl %ebx
 ; X86-NEXT:    cmpw $73, %cx
 ; X86-NEXT:    jl .LBB0_1
 ; X86-NEXT:  # %bb.4: # %for.body1703

diff  --git a/llvm/test/CodeGen/X86/pr63108.ll b/llvm/test/CodeGen/X86/pr63108.ll
index 8df90a935314d..b5b80515fc6d9 100644
--- a/llvm/test/CodeGen/X86/pr63108.ll
+++ b/llvm/test/CodeGen/X86/pr63108.ll
@@ -21,7 +21,7 @@ define i32 @PR63108() {
 ; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    testb %al, %al
-; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    pxor %xmm1, %xmm1
 ; SSE-NEXT:    jne .LBB0_3
 ; SSE-NEXT:  # %bb.4: # %middle.block
 ; SSE-NEXT:    pxor %xmm2, %xmm0

diff  --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll
index 7cfc88a77dea5..fe71a16039c19 100644
--- a/llvm/test/CodeGen/X86/sad.ll
+++ b/llvm/test/CodeGen/X86/sad.ll
@@ -151,27 +151,27 @@ define dso_local i32 @sad_32i8() nounwind {
 ; SSE2-LABEL: sad_32i8:
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    movq $-1024, %rax # imm = 0xFC00
+; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NEXT:    .p2align 4
 ; SSE2-NEXT:  .LBB1_1: # %vector.body
 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
 ; SSE2-NEXT:    movdqa a+1024(%rax), %xmm3
 ; SSE2-NEXT:    psadbw b+1024(%rax), %xmm3
-; SSE2-NEXT:    paddd %xmm3, %xmm2
+; SSE2-NEXT:    paddd %xmm3, %xmm1
 ; SSE2-NEXT:    movdqa a+1040(%rax), %xmm3
 ; SSE2-NEXT:    psadbw b+1040(%rax), %xmm3
-; SSE2-NEXT:    paddd %xmm3, %xmm1
+; SSE2-NEXT:    paddd %xmm3, %xmm2
 ; SSE2-NEXT:    addq $32, %rax
 ; SSE2-NEXT:    jne .LBB1_1
 ; SSE2-NEXT:  # %bb.2: # %middle.block
-; SSE2-NEXT:    paddd %xmm0, %xmm1
 ; SSE2-NEXT:    paddd %xmm0, %xmm2
+; SSE2-NEXT:    paddd %xmm0, %xmm1
 ; SSE2-NEXT:    paddd %xmm0, %xmm0
-; SSE2-NEXT:    paddd %xmm0, %xmm2
-; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm1
 ; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    paddd %xmm1, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
@@ -356,9 +356,9 @@ define dso_local i32 @sad_avx64i8() nounwind {
 ; AVX1-LABEL: sad_avx64i8:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    movq $-1024, %rax # imm = 0xFC00
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    .p2align 4
 ; AVX1-NEXT:  .LBB2_1: # %vector.body
 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -408,26 +408,26 @@ define dso_local i32 @sad_avx64i8() nounwind {
 ; AVX2-LABEL: sad_avx64i8:
 ; AVX2:       # %bb.0: # %entry
 ; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    movq $-1024, %rax # imm = 0xFC00
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    .p2align 4
 ; AVX2-NEXT:  .LBB2_1: # %vector.body
 ; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
 ; AVX2-NEXT:    vmovdqa a+1024(%rax), %ymm3
 ; AVX2-NEXT:    vpsadbw b+1024(%rax), %ymm3, %ymm3
-; AVX2-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
 ; AVX2-NEXT:    vmovdqa a+1056(%rax), %ymm3
 ; AVX2-NEXT:    vpsadbw b+1056(%rax), %ymm3, %ymm3
-; AVX2-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    addq $64, %rax
 ; AVX2-NEXT:    jne .LBB2_1
 ; AVX2-NEXT:  # %bb.2: # %middle.block
-; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm2
 ; AVX2-NEXT:    vpaddd %ymm0, %ymm0, %ymm3
-; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpaddd %ymm3, %ymm0, %ymm0
-; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0

diff  --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
index b4a6960d144e1..325f735b09cd9 100644
--- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -1266,13 +1266,14 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c,
 ; X86-SSE-LABEL: add_ss_mask:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    je .LBB70_2
-; X86-SSE-NEXT:  # %bb.1:
-; X86-SSE-NEXT:    addss %xmm0, %xmm1
-; X86-SSE-NEXT:    movaps %xmm1, %xmm2
-; X86-SSE-NEXT:  .LBB70_2:
+; X86-SSE-NEXT:    jne .LBB70_1
+; X86-SSE-NEXT:  # %bb.2:
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
 ; X86-SSE-NEXT:    retl
+; X86-SSE-NEXT:  .LBB70_1:
+; X86-SSE-NEXT:    addss %xmm0, %xmm1
+; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: add_ss_mask:
 ; X86-AVX1:       # %bb.0:
@@ -1295,13 +1296,14 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c,
 ; X64-SSE-LABEL: add_ss_mask:
 ; X64-SSE:       # %bb.0:
 ; X64-SSE-NEXT:    testb $1, %dil
-; X64-SSE-NEXT:    je .LBB70_2
-; X64-SSE-NEXT:  # %bb.1:
-; X64-SSE-NEXT:    addss %xmm0, %xmm1
-; X64-SSE-NEXT:    movaps %xmm1, %xmm2
-; X64-SSE-NEXT:  .LBB70_2:
+; X64-SSE-NEXT:    jne .LBB70_1
+; X64-SSE-NEXT:  # %bb.2:
 ; X64-SSE-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
 ; X64-SSE-NEXT:    retq
+; X64-SSE-NEXT:  .LBB70_1:
+; X64-SSE-NEXT:    addss %xmm0, %xmm1
+; X64-SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX1-LABEL: add_ss_mask:
 ; X64-AVX1:       # %bb.0:
@@ -1334,13 +1336,14 @@ define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double>
 ; X86-SSE-LABEL: add_sd_mask:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    je .LBB71_2
-; X86-SSE-NEXT:  # %bb.1:
-; X86-SSE-NEXT:    addsd %xmm0, %xmm1
-; X86-SSE-NEXT:    movapd %xmm1, %xmm2
-; X86-SSE-NEXT:  .LBB71_2:
+; X86-SSE-NEXT:    jne .LBB71_1
+; X86-SSE-NEXT:  # %bb.2:
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; X86-SSE-NEXT:    retl
+; X86-SSE-NEXT:  .LBB71_1:
+; X86-SSE-NEXT:    addsd %xmm0, %xmm1
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: add_sd_mask:
 ; X86-AVX1:       # %bb.0:
@@ -1363,13 +1366,14 @@ define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double>
 ; X64-SSE-LABEL: add_sd_mask:
 ; X64-SSE:       # %bb.0:
 ; X64-SSE-NEXT:    testb $1, %dil
-; X64-SSE-NEXT:    je .LBB71_2
-; X64-SSE-NEXT:  # %bb.1:
-; X64-SSE-NEXT:    addsd %xmm0, %xmm1
-; X64-SSE-NEXT:    movapd %xmm1, %xmm2
-; X64-SSE-NEXT:  .LBB71_2:
+; X64-SSE-NEXT:    jne .LBB71_1
+; X64-SSE-NEXT:  # %bb.2:
 ; X64-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; X64-SSE-NEXT:    retq
+; X64-SSE-NEXT:  .LBB71_1:
+; X64-SSE-NEXT:    addsd %xmm0, %xmm1
+; X64-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX1-LABEL: add_sd_mask:
 ; X64-AVX1:       # %bb.0:

diff  --git a/llvm/test/CodeGen/X86/statepoint-cmp-sunk-past-statepoint.ll b/llvm/test/CodeGen/X86/statepoint-cmp-sunk-past-statepoint.ll
index 089ec7ae7658e..391963de79703 100644
--- a/llvm/test/CodeGen/X86/statepoint-cmp-sunk-past-statepoint.ll
+++ b/llvm/test/CodeGen/X86/statepoint-cmp-sunk-past-statepoint.ll
@@ -60,7 +60,7 @@ zero:
 ; CHECK:        JMP_1 %bb.4
 ; CHECK:      bb.4
 ; CHECK:      bb.5
-; CHECK:        %20:gr64 = COPY %10
+; CHECK:        %3:gr64 = COPY %10
 ; CHECK:        %4:gr64 = COPY killed %10
 ; CHECK:        %4:gr64 = nuw ADD64ri32 %4, 8, implicit-def dead $eflags
 ; CHECK:        TEST64rr killed %1, %1, implicit-def $eflags

diff  --git a/llvm/test/CodeGen/X86/swifterror.ll b/llvm/test/CodeGen/X86/swifterror.ll
index 473cc9d152627..5699c447baf41 100644
--- a/llvm/test/CodeGen/X86/swifterror.ll
+++ b/llvm/test/CodeGen/X86/swifterror.ll
@@ -426,19 +426,18 @@ define float @foo_loop(ptr swifterror %error_ptr_ref, i32 %cc, float %cc2) {
 ; CHECK-O0-NEXT:    jmp LBB4_1
 ; CHECK-O0-NEXT:  LBB4_1: ## %bb_loop
 ; CHECK-O0-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-O0-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload
-; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
-; CHECK-O0-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-O0-NEXT:    cmpl $0, %eax
+; CHECK-O0-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 4-byte Reload
+; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
+; CHECK-O0-NEXT:    cmpl $0, %ecx
+; CHECK-O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-O0-NEXT:    je LBB4_3
 ; CHECK-O0-NEXT:  ## %bb.2: ## %gen_error
 ; CHECK-O0-NEXT:    ## in Loop: Header=BB4_1 Depth=1
 ; CHECK-O0-NEXT:    movl $16, %edi
 ; CHECK-O0-NEXT:    callq _malloc
 ; CHECK-O0-NEXT:    movq %rax, %rcx
-; CHECK-O0-NEXT:    movq %rcx, %rax
-; CHECK-O0-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-O0-NEXT:    movb $1, 8(%rax)
+; CHECK-O0-NEXT:    movb $1, 8(%rcx)
+; CHECK-O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-O0-NEXT:  LBB4_3: ## %bb_cont
 ; CHECK-O0-NEXT:    ## in Loop: Header=BB4_1 Depth=1
 ; CHECK-O0-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload
@@ -965,17 +964,17 @@ define void @swifterror_isel(ptr) {
 ; CHECK-O0-NEXT:    ## implicit-def: $r12
 ; CHECK-O0-NEXT:    jne LBB8_2
 ; CHECK-O0-NEXT:  LBB8_1: ## =>This Inner Loop Header: Depth=1
-; CHECK-O0-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax ## 2-byte Reload
 ; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Reload
+; CHECK-O0-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax ## 2-byte Reload
 ; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 ## 8-byte Reload
 ; CHECK-O0-NEXT:    ## implicit-def: $edi
 ; CHECK-O0-NEXT:    movw %ax, %di
 ; CHECK-O0-NEXT:    ## implicit-def: $rax
 ; CHECK-O0-NEXT:    callq *%rax
-; CHECK-O0-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-O0-NEXT:    ## implicit-def: $rax
 ; CHECK-O0-NEXT:    movw (%rax), %ax
 ; CHECK-O0-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; CHECK-O0-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-O0-NEXT:    jmp LBB8_1
 ; CHECK-O0-NEXT:  LBB8_2:
 ; CHECK-O0-NEXT:    addq $40, %rsp

diff  --git a/llvm/test/DebugInfo/MIR/InstrRef/phi-regallocd-to-stack.mir b/llvm/test/DebugInfo/MIR/InstrRef/phi-regallocd-to-stack.mir
index 31c694ee1e014..ed04647c84066 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/phi-regallocd-to-stack.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/phi-regallocd-to-stack.mir
@@ -70,7 +70,8 @@ frameInfo:
 machineFunctionInfo: {}
 body:             |
   ; CHECK-LABEL: bb.0:
-  ; CHECK:       MOV32mr %stack.[[SLOT:.+]], 1, $noreg, 0, $noreg, $edi
+  ; CHECK:       renamable $ebp = COPY $edi
+  ; CHECK:       MOV32mr %stack.1, 1, $noreg, 0, $noreg, killed renamable $ebp
   bb.0:
     successors: %bb.2(0x50000000), %bb.1(0x30000000)
     liveins: $edi, $esi
@@ -141,8 +142,8 @@ body:             |
     %64:gr32 = PHI %24, %bb.0, %44, %bb.1, debug-location !18
 
     DBG_INSTR_REF !14, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !12
-    ; CHECK:      DBG_PHI %stack.[[SLOT]], 1, 32
-    ; CHECK:      renamable $eax = MOV32rm %stack.[[SLOT]],
+    ; CHECK:      DBG_PHI %stack.1, 1, 32
+    ; CHECK:      renamable $eax = MOV32rm %stack.1,
     ; CHECK:      DBG_INSTR_REF {{.+}}, dbg-instr-ref(1, 0)
     $eax = COPY killed %0, debug-location !19
     RET 0, killed $eax, debug-location !19

diff  --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll
index f1e443b701bbe..5976658ccdf86 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll
@@ -10,22 +10,23 @@ define i32 @i32_initially_postidx(ptr %p, i64 %n) {
 ; CHECK-NEXT:    cmp x1, #1
 ; CHECK-NEXT:    b.lt .LBB0_5
 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    mov x8, x0
-; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    mov w8, wzr
 ; CHECK-NEXT:  .LBB0_2: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr w9, [x8], #4
-; CHECK-NEXT:    add w0, w0, w9
-; CHECK-NEXT:    cmp w0, #0
+; CHECK-NEXT:    ldr w9, [x0], #4
+; CHECK-NEXT:    add w8, w8, w9
+; CHECK-NEXT:    cmp w8, #0
 ; CHECK-NEXT:    b.lo .LBB0_5
 ; CHECK-NEXT:  // %bb.3: // %for.inc
 ; CHECK-NEXT:    // in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    subs x1, x1, #1
 ; CHECK-NEXT:    b.ne .LBB0_2
 ; CHECK-NEXT:  // %bb.4: // %cleanup
+; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB0_5:
-; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    ret
 entry:
   %cmp1 = icmp sgt i64 %n, 0
@@ -57,22 +58,23 @@ define i32 @i32_initially_offset(ptr %p, i64 %n) {
 ; CHECK-NEXT:    cmp x1, #1
 ; CHECK-NEXT:    b.lt .LBB1_5
 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    mov x8, x0
-; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    mov w8, wzr
 ; CHECK-NEXT:  .LBB1_2: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr w9, [x8], #4
-; CHECK-NEXT:    add w0, w0, w9
-; CHECK-NEXT:    cmp w0, #0
+; CHECK-NEXT:    ldr w9, [x0], #4
+; CHECK-NEXT:    add w8, w8, w9
+; CHECK-NEXT:    cmp w8, #0
 ; CHECK-NEXT:    b.lo .LBB1_5
 ; CHECK-NEXT:  // %bb.3: // %for.cond
 ; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
 ; CHECK-NEXT:    subs x1, x1, #1
 ; CHECK-NEXT:    b.ne .LBB1_2
 ; CHECK-NEXT:  // %bb.4: // %cleanup
+; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB1_5:
-; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    ret
 entry:
   %cmp1 = icmp sgt i64 %n, 0

diff  --git a/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll b/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll
index 1f7a5b12d16b8..7353acd7228cd 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll
@@ -7,23 +7,27 @@ target triple = "riscv64-unknown-linux-gnu"
 define ptr @foo(ptr %a0, ptr %a1, i64 %a2) {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    mv a3, a0
 ; CHECK-NEXT:    vsetvli a4, a2, e8, m8, ta, ma
-; CHECK-NEXT:    beq a4, a2, .LBB0_4
-; CHECK-NEXT:  # %bb.1: # %if.then
+; CHECK-NEXT:    bne a4, a2, .LBB0_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a1)
+; CHECK-NEXT:    vse8.v v8, (a0)
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_2: # %if.then
 ; CHECK-NEXT:    add a2, a0, a2
 ; CHECK-NEXT:    sub a5, a2, a4
-; CHECK-NEXT:  .LBB0_2: # %do.body
+; CHECK-NEXT:    mv a3, a0
+; CHECK-NEXT:  .LBB0_3: # %do.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vse8.v v8, (a3)
 ; CHECK-NEXT:    add a3, a3, a4
 ; CHECK-NEXT:    add a1, a1, a4
-; CHECK-NEXT:    bltu a3, a5, .LBB0_2
-; CHECK-NEXT:  # %bb.3: # %do.end
+; CHECK-NEXT:    bltu a3, a5, .LBB0_3
+; CHECK-NEXT:  # %bb.4: # %do.end
 ; CHECK-NEXT:    sub a2, a2, a3
 ; CHECK-NEXT:    vsetvli a2, a2, e8, m8, ta, ma
-; CHECK-NEXT:  .LBB0_4: # %if.end
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vse8.v v8, (a3)


        


More information about the llvm-commits mailing list